From 7acb585f321f7560488470a0e6a06b916639fc8c Mon Sep 17 00:00:00 2001 From: Radovan Fuchs Date: Mon, 2 Feb 2026 09:24:56 +0100 Subject: [PATCH 01/10] update workflow to restart the pod add new files add new configs cleanup cleanup fix the tests fix black fix black resolve conflict use different models and fix the logic clean up comments from run.yaml remove generated file simplify the logic cleanup --- dev-tools/mcp-mock-server/server.py | 73 ++++-- .../lightspeed-stack-auth-noop-token.yaml | 31 +++ .../lightspeed-stack-auth-rh-identity.yaml | 25 ++ ...tspeed-stack-invalid-feedback-storage.yaml | 25 ++ .../configs/lightspeed-stack-no-cache.yaml | 27 ++ .../rhoai/configs/lightspeed-stack-rbac.yaml | 94 +++++++ .../rhoai/configs/lightspeed-stack.yaml | 18 ++ tests/e2e-prow/rhoai/configs/run.yaml | 62 +++-- .../manifests/lightspeed/llama-stack.yaml | 27 ++ .../manifests/lightspeed/mcp-mock-server.yaml | 50 ++++ .../rhoai/manifests/lightspeed/mock-jwks.yaml | 46 ++++ .../rhoai/manifests/test-pod/spin-up.yaml | 30 --- .../manifests/vllm/vllm-runtime-cpu.yaml | 2 +- .../manifests/vllm/vllm-runtime-gpu.yaml | 4 +- tests/e2e-prow/rhoai/pipeline-services.sh | 6 +- tests/e2e-prow/rhoai/pipeline.sh | 179 +++++++++----- tests/e2e-prow/rhoai/run-tests.sh | 38 ++- tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 234 ++++++++++++++++++ tests/e2e/features/environment.py | 173 +++++++------ tests/e2e/features/steps/health.py | 9 + .../e2e/features/steps/llm_query_response.py | 4 +- tests/e2e/features/steps/rbac.py | 5 +- tests/e2e/utils/prow_utils.py | 215 ++++++++++++++++ tests/e2e/utils/utils.py | 32 +++ 24 files changed, 1182 insertions(+), 227 deletions(-) create mode 100644 tests/e2e-prow/rhoai/configs/lightspeed-stack-auth-noop-token.yaml create mode 100644 tests/e2e-prow/rhoai/configs/lightspeed-stack-auth-rh-identity.yaml create mode 100644 tests/e2e-prow/rhoai/configs/lightspeed-stack-invalid-feedback-storage.yaml create mode 100644 
tests/e2e-prow/rhoai/configs/lightspeed-stack-no-cache.yaml create mode 100644 tests/e2e-prow/rhoai/configs/lightspeed-stack-rbac.yaml create mode 100644 tests/e2e-prow/rhoai/manifests/lightspeed/mcp-mock-server.yaml create mode 100644 tests/e2e-prow/rhoai/manifests/lightspeed/mock-jwks.yaml delete mode 100644 tests/e2e-prow/rhoai/manifests/test-pod/spin-up.yaml mode change 100644 => 100755 tests/e2e-prow/rhoai/run-tests.sh create mode 100755 tests/e2e-prow/rhoai/scripts/e2e-ops.sh create mode 100644 tests/e2e/utils/prow_utils.py diff --git a/dev-tools/mcp-mock-server/server.py b/dev-tools/mcp-mock-server/server.py index b7e17fffb..a6f09dcbb 100644 --- a/dev-tools/mcp-mock-server/server.py +++ b/dev-tools/mcp-mock-server/server.py @@ -6,16 +6,19 @@ useful for validating that Lightspeed Core Stack correctly sends auth headers to MCP servers. -The server runs both HTTP and HTTPS simultaneously on consecutive ports. +The server runs HTTP and optionally HTTPS on consecutive ports. +Set MCP_HTTP_ONLY=true to disable HTTPS (useful when openssl is unavailable). 
Usage: python server.py [http_port] Example: python server.py 3000 # HTTP on 3000, HTTPS on 3001 + MCP_HTTP_ONLY=true python server.py 3000 # HTTP only on 3000 """ import json +import os import ssl import subprocess import sys @@ -268,31 +271,43 @@ def run_https_server(port: int, httpd: HTTPServer) -> None: def main() -> None: - """Start the mock MCP server with both HTTP and HTTPS.""" + """Start the mock MCP server with HTTP and optionally HTTPS.""" http_port = int(sys.argv[1]) if len(sys.argv) > 1 else 3000 - https_port = http_port + 1 + http_only = os.environ.get("MCP_HTTP_ONLY", "").lower() in ("true", "1", "yes") # Create HTTP server http_server = HTTPServer(("", http_port), MCPMockHandler) - # Create HTTPS server with self-signed certificate - https_server = HTTPServer(("", https_port), MCPMockHandler) - - # Generate or load self-signed certificate - script_dir = Path(__file__).parent - cert_dir = script_dir / ".certs" - cert_file, key_file = generate_self_signed_cert(cert_dir) - - # Wrap socket with SSL - context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) - context.load_cert_chain(cert_file, key_file) - https_server.socket = context.wrap_socket(https_server.socket, server_side=True) + https_server = None + if not http_only: + try: + https_port = http_port + 1 + https_server = HTTPServer(("", https_port), MCPMockHandler) + + # Generate or load self-signed certificate + script_dir = Path(__file__).parent + cert_dir = script_dir / ".certs" + cert_file, key_file = generate_self_signed_cert(cert_dir) + + # Wrap socket with SSL + context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) + context.load_cert_chain(cert_file, key_file) + https_server.socket = context.wrap_socket( + https_server.socket, server_side=True + ) + except (subprocess.CalledProcessError, FileNotFoundError, OSError) as e: + print(f"HTTPS setup failed ({e}), running HTTP only") + https_server = None print("=" * 70) - print("MCP Mock Server starting with HTTP and HTTPS") + if https_server: + print("MCP 
Mock Server starting with HTTP and HTTPS") + else: + print("MCP Mock Server starting (HTTP only)") print("=" * 70) print(f"HTTP: http://localhost:{http_port}") - print(f"HTTPS: https://localhost:{https_port}") + if https_server: + print(f"HTTPS: https://localhost:{https_port}") print("=" * 70) print("Debug endpoints:") print(" • /debug/headers - View captured headers") @@ -300,29 +315,35 @@ def main() -> None: print("MCP endpoint:") print(" • POST to any path (e.g., / or /mcp/v1/list_tools)") print("=" * 70) - print("Note: HTTPS uses a self-signed certificate (for testing only)") + if https_server: + print("Note: HTTPS uses a self-signed certificate (for testing only)") print("Press Ctrl+C to stop") print() - # Start servers in separate threads + # Start HTTP server in a thread http_thread = threading.Thread( target=run_http_server, args=(http_port, http_server), daemon=True ) - https_thread = threading.Thread( - target=run_https_server, args=(https_port, https_server), daemon=True - ) - http_thread.start() - https_thread.start() + + # Start HTTPS server if available + https_thread = None + if https_server: + https_thread = threading.Thread( + target=run_https_server, args=(https_port, https_server), daemon=True + ) + https_thread.start() try: # Keep main thread alive http_thread.join() - https_thread.join() + if https_thread: + https_thread.join() except KeyboardInterrupt: print("\nShutting down mock servers...") http_server.shutdown() - https_server.shutdown() + if https_server: + https_server.shutdown() if __name__ == "__main__": diff --git a/tests/e2e-prow/rhoai/configs/lightspeed-stack-auth-noop-token.yaml b/tests/e2e-prow/rhoai/configs/lightspeed-stack-auth-noop-token.yaml new file mode 100644 index 000000000..4dfd3ed4e --- /dev/null +++ b/tests/e2e-prow/rhoai/configs/lightspeed-stack-auth-noop-token.yaml @@ -0,0 +1,31 @@ +name: Lightspeed Core Service (LCS) +service: + host: 0.0.0.0 + port: 8080 + auth_enabled: false + workers: 1 + color_log: true + 
access_log: true +llama_stack: + # Uses a remote llama-stack service + # The instance would have already been started with a llama-stack-run.yaml file + use_as_library_client: false + # Alternative for "as library use" + # use_as_library_client: true + # library_client_config_path: + url: http://${env.E2E_LLAMA_HOSTNAME}:8321 + api_key: xyzzy +user_data_collection: + feedback_enabled: true + feedback_storage: "/tmp/data/feedback" + transcripts_enabled: true + transcripts_storage: "/tmp/data/transcripts" + +# Conversation cache for storing Q&A history +conversation_cache: + type: "sqlite" + sqlite: + db_path: "/tmp/data/conversation-cache.db" + +authentication: + module: "noop-with-token" diff --git a/tests/e2e-prow/rhoai/configs/lightspeed-stack-auth-rh-identity.yaml b/tests/e2e-prow/rhoai/configs/lightspeed-stack-auth-rh-identity.yaml new file mode 100644 index 000000000..e2b468cf0 --- /dev/null +++ b/tests/e2e-prow/rhoai/configs/lightspeed-stack-auth-rh-identity.yaml @@ -0,0 +1,25 @@ +name: Lightspeed Core Service (LCS) - RH Identity Auth +service: + host: 0.0.0.0 + port: 8080 + auth_enabled: true + workers: 1 + color_log: true + access_log: true +llama_stack: + use_as_library_client: false + url: http://${env.E2E_LLAMA_HOSTNAME}:8321 + api_key: xyzzy +user_data_collection: + feedback_enabled: true + feedback_storage: "/tmp/data/feedback" + transcripts_enabled: true + transcripts_storage: "/tmp/data/transcripts" +conversation_cache: + type: "sqlite" + sqlite: + db_path: "/tmp/data/conversation-cache.db" +authentication: + module: "rh-identity" + rh_identity_config: + required_entitlements: ["rhel"] diff --git a/tests/e2e-prow/rhoai/configs/lightspeed-stack-invalid-feedback-storage.yaml b/tests/e2e-prow/rhoai/configs/lightspeed-stack-invalid-feedback-storage.yaml new file mode 100644 index 000000000..eb6ba2054 --- /dev/null +++ b/tests/e2e-prow/rhoai/configs/lightspeed-stack-invalid-feedback-storage.yaml @@ -0,0 +1,25 @@ +name: Lightspeed Core Service (LCS) 
+service: + host: 0.0.0.0 + port: 8080 + auth_enabled: false + workers: 1 + color_log: true + access_log: true +llama_stack: + # Uses a remote llama-stack service + # The instance would have already been started with a llama-stack-run.yaml file + use_as_library_client: false + # Alternative for "as library use" + # use_as_library_client: true + # library_client_config_path: + url: http://${env.E2E_LLAMA_HOSTNAME}:8321 + api_key: xyzzy +user_data_collection: + feedback_enabled: true + feedback_storage: "/invalid" + transcripts_enabled: true + transcripts_storage: "/tmp/data/transcripts" + +authentication: + module: "noop-with-token" diff --git a/tests/e2e-prow/rhoai/configs/lightspeed-stack-no-cache.yaml b/tests/e2e-prow/rhoai/configs/lightspeed-stack-no-cache.yaml new file mode 100644 index 000000000..6c8f31438 --- /dev/null +++ b/tests/e2e-prow/rhoai/configs/lightspeed-stack-no-cache.yaml @@ -0,0 +1,27 @@ +name: Lightspeed Core Service (LCS) +service: + host: 0.0.0.0 + port: 8080 + auth_enabled: false + workers: 1 + color_log: true + access_log: true +llama_stack: + # Uses a remote llama-stack service + # The instance would have already been started with a llama-stack-run.yaml file + use_as_library_client: false + # Alternative for "as library use" + # use_as_library_client: true + # library_client_config_path: + url: http://${env.E2E_LLAMA_HOSTNAME}:8321 + api_key: xyzzy +user_data_collection: + feedback_enabled: true + feedback_storage: "/tmp/data/feedback" + transcripts_enabled: true + transcripts_storage: "/tmp/data/transcripts" + +# NO conversation_cache configured - for testing error handling + +authentication: + module: "noop-with-token" diff --git a/tests/e2e-prow/rhoai/configs/lightspeed-stack-rbac.yaml b/tests/e2e-prow/rhoai/configs/lightspeed-stack-rbac.yaml new file mode 100644 index 000000000..e2e4bfa16 --- /dev/null +++ b/tests/e2e-prow/rhoai/configs/lightspeed-stack-rbac.yaml @@ -0,0 +1,94 @@ +name: Lightspeed Core Service (RBAC E2E Tests) +service: 
+ host: 0.0.0.0 + port: 8080 + auth_enabled: true + workers: 1 + color_log: true + access_log: true + +llama_stack: + use_as_library_client: false + url: http://${env.E2E_LLAMA_HOSTNAME}:8321 + api_key: xyzzy + +user_data_collection: + feedback_enabled: true + feedback_storage: "/tmp/data/feedback" + transcripts_enabled: true + transcripts_storage: "/tmp/data/transcripts" + +# Conversation cache for storing Q&A history +conversation_cache: + type: "sqlite" + sqlite: + db_path: "/tmp/data/conversation-cache.db" + +# JWK token authentication with role extraction +authentication: + module: "jwk-token" + jwk_config: + url: "http://mock-jwks:8000/.well-known/jwks.json" + jwt_configuration: + user_id_claim: "sub" + username_claim: "name" + # Role rules: extract roles from JWT claims + role_rules: + # Grant 'admin' role to users with admin=true in JWT + - jsonpath: "$.admin" + operator: "equals" + value: [true] + roles: ["admin"] + # Grant 'user' role to users with role=user in JWT + - jsonpath: "$.role" + operator: "equals" + value: ["user"] + roles: ["user"] + # Grant 'viewer' role to users with role=viewer in JWT + - jsonpath: "$.role" + operator: "equals" + value: ["viewer"] + roles: ["viewer"] + # Grant 'query_only' role based on permissions array containing 'query' + - jsonpath: "$.permissions[*]" + operator: "contains" + value: "query" + roles: ["query_only"] + +# Authorization: map roles to actions +authorization: + access_rules: + # Admin role gets full access + - role: "admin" + actions: ["admin"] + # User role can query, access conversations, and provide feedback + - role: "user" + actions: + - "query" + - "streaming_query" + - "get_conversation" + - "list_conversations" + - "delete_conversation" + - "update_conversation" + - "feedback" + - "get_models" + - "get_tools" + - "info" + - "model_override" + # Viewer role can only read (no mutations) + - role: "viewer" + actions: + - "get_conversation" + - "list_conversations" + - "get_models" + - "get_tools" + - 
"info" + # Query-only role can only query (no model_override - must use defaults) + - role: "query_only" + actions: + - "query" + - "streaming_query" + # Everyone (*) role gets basic info access + - role: "*" + actions: + - "info" diff --git a/tests/e2e-prow/rhoai/configs/lightspeed-stack.yaml b/tests/e2e-prow/rhoai/configs/lightspeed-stack.yaml index cd667a4f0..b1fecbdbb 100644 --- a/tests/e2e-prow/rhoai/configs/lightspeed-stack.yaml +++ b/tests/e2e-prow/rhoai/configs/lightspeed-stack.yaml @@ -23,3 +23,21 @@ user_data_collection: authentication: module: "noop" + +mcp_servers: + # Mock server with client-provided auth - should appear in mcp-auth/client-options response + - name: "github-api" + provider_id: "model-context-protocol" + url: "http://mcp-mock-server:3000" + authorization_headers: + Authorization: "client" + # Mock server with client-provided auth (different header) - should appear in response + - name: "gitlab-api" + provider_id: "model-context-protocol" + url: "http://mcp-mock-server:3000" + authorization_headers: + X-API-Token: "client" + # Mock server with no auth - should NOT appear in response + - name: "public-api" + provider_id: "model-context-protocol" + url: "http://mcp-mock-server:3000" diff --git a/tests/e2e-prow/rhoai/configs/run.yaml b/tests/e2e-prow/rhoai/configs/run.yaml index 20f11f547..935ae206f 100644 --- a/tests/e2e-prow/rhoai/configs/run.yaml +++ b/tests/e2e-prow/rhoai/configs/run.yaml @@ -1,5 +1,5 @@ version: 2 -image_name: rhoai-configuration +image_name: starter apis: - agents @@ -12,24 +12,19 @@ apis: - scoring - tool_runtime - vector_io - + benchmarks: [] datasets: [] -# external_providers_dir: /opt/app-root/src/.llama/providers.d providers: inference: - provider_id: vllm provider_type: remote::vllm config: - url: ${env.KSVC_URL}/v1/ + base_url: ${env.KSVC_URL}/v1/ api_token: ${env.VLLM_API_KEY} tls_verify: false - max_tokens: 1024 - # - provider_id: openai - # provider_type: remote::openai - # config: - # api_key: 
${env.OPENAI_API_KEY} + max_tokens: 512 - config: {} provider_id: sentence-transformers provider_type: inline::sentence-transformers @@ -38,24 +33,34 @@ providers: metadata_store: table_name: files_metadata backend: sql_default - storage_dir: ~/.llama/storage/files + storage_dir: /opt/app-root/src/.llama/storage/files provider_id: meta-reference-files provider_type: inline::localfs safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: - excluded_categories: [] + - config: + excluded_categories: [] + provider_id: llama-guard + provider_type: inline::llama-guard scoring: - - config: {} - provider_id: basic + - provider_id: basic provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: + openai_api_key: '********' tool_runtime: - - config: {} + - config: {} # Enable the RAG tool provider_id: rag-runtime provider_type: inline::rag-runtime + - config: {} # Enable MCP (Model Context Protocol) support + provider_id: model-context-protocol + provider_type: remote::model-context-protocol vector_io: - - config: + - config: # Define the storage backend for RAG persistence: namespace: vector_io::faiss backend: kv_rag @@ -104,12 +109,15 @@ server: port: 8321 storage: backends: - kv_default: + kv_default: # Single database for registry AND RAG data type: kv_sqlite - db_path: ${env.KV_STORE_PATH:=~/.llama/storage/rag/kv_store.db} + db_path: /opt/app-root/src/.llama/storage/rag/kv_store.db + kv_rag: + type: kv_sqlite + db_path: /opt/app-root/src/.llama/storage/rag/kv_store.db sql_default: type: sql_sqlite - db_path: ${env.SQL_STORE_PATH:=~/.llama/storage/sql_store.db} + db_path: ${env.SQL_STORE_PATH:=/opt/app-root/src/.llama/storage/sql_store.db} stores: metadata: namespace: registry @@ -127,7 +135,7 @@ storage: backend: kv_default registered_resources: models: - - model_id: meta-llama/Llama-3.2-1B-Instruct + - 
model_id: meta-llama/Llama-3.1-8B-Instruct provider_id: vllm model_type: llm provider_model_id: null @@ -141,20 +149,20 @@ registered_resources: - embedding_dimension: 768 embedding_model: sentence-transformers/all-mpnet-base-v2 provider_id: faiss - vector_store_id: vs_8c94967b-81cc-4028-a294-9cfac6fd9ae2 #TODO: parse this from the rag db + vector_store_id: ${env.FAISS_VECTOR_STORE_ID} shields: - shield_id: llama-guard provider_id: llama-guard - provider_shield_id: vllm/meta-llama/Llama-3.2-1B-Instruct + provider_shield_id: vllm/meta-llama/Llama-3.1-8B-Instruct datasets: [] scoring_fns: [] benchmarks: [] tool_groups: - - toolgroup_id: builtin::rag - provider_id: rag-runtime + - provider_id: rag-runtime + toolgroup_id: builtin::rag vector_stores: default_provider_id: faiss - default_embedding_model: + default_embedding_model: # Define the default embedding model for RAG provider_id: sentence-transformers model_id: all-mpnet-base-v2 safety: diff --git a/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack.yaml b/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack.yaml index 005f96978..31d934e28 100644 --- a/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack.yaml +++ b/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack.yaml @@ -6,8 +6,25 @@ metadata: spec: imagePullSecrets: - name: quay-lightspeed-pull-secret + initContainers: + - name: setup-rag-data + image: busybox:latest + command: + - /bin/sh + - -c + - | + mkdir -p /data/storage/rag + gunzip -c /rag-data/kv_store.db.gz > /data/storage/rag/kv_store.db + echo "RAG data extracted successfully" + ls -la /data/storage/rag/ + volumeMounts: + - name: app-root + mountPath: /data + - name: rag-data + mountPath: /rag-data containers: - name: llama-stack-container + command: ["llama", "stack", "run", "/opt/app-root/run.yaml"] env: - name: KSVC_URL valueFrom: @@ -19,6 +36,13 @@ spec: secretKeyRef: name: vllm-api-key-secret key: key + - name: INFERENCE_MODEL + value: "meta-llama/Llama-3.1-8B-Instruct" + - name: 
FAISS_VECTOR_STORE_ID + valueFrom: + secretKeyRef: + name: faiss-vector-store-secret + key: id image: ${LLAMA_STACK_IMAGE} ports: - containerPort: 8321 @@ -34,3 +58,6 @@ spec: - name: config configMap: name: llama-stack-config + - name: rag-data + configMap: + name: rag-data diff --git a/tests/e2e-prow/rhoai/manifests/lightspeed/mcp-mock-server.yaml b/tests/e2e-prow/rhoai/manifests/lightspeed/mcp-mock-server.yaml new file mode 100644 index 000000000..a8e236f68 --- /dev/null +++ b/tests/e2e-prow/rhoai/manifests/lightspeed/mcp-mock-server.yaml @@ -0,0 +1,50 @@ +apiVersion: v1 +kind: Pod +metadata: + name: mcp-mock-server + namespace: e2e-rhoai-dsc + labels: + app: mcp-mock-server +spec: + containers: + - name: mcp-mock-server + image: python:3.12-slim + # Run HTTP-only version of the mock server + command: ["python", "/app/server.py", "3000"] + env: + - name: MCP_HTTP_ONLY + value: "true" + ports: + - containerPort: 3000 + volumeMounts: + - name: server-script + mountPath: /app/server.py + subPath: server.py + readinessProbe: + httpGet: + path: / + port: 3000 + initialDelaySeconds: 5 + periodSeconds: 5 + livenessProbe: + httpGet: + path: / + port: 3000 + initialDelaySeconds: 10 + periodSeconds: 10 + volumes: + - name: server-script + configMap: + name: mcp-mock-server-script +--- +apiVersion: v1 +kind: Service +metadata: + name: mcp-mock-server + namespace: e2e-rhoai-dsc +spec: + selector: + app: mcp-mock-server + ports: + - port: 3000 + targetPort: 3000 diff --git a/tests/e2e-prow/rhoai/manifests/lightspeed/mock-jwks.yaml b/tests/e2e-prow/rhoai/manifests/lightspeed/mock-jwks.yaml new file mode 100644 index 000000000..b1555c730 --- /dev/null +++ b/tests/e2e-prow/rhoai/manifests/lightspeed/mock-jwks.yaml @@ -0,0 +1,46 @@ +apiVersion: v1 +kind: Pod +metadata: + name: mock-jwks + namespace: e2e-rhoai-dsc + labels: + app: mock-jwks +spec: + containers: + - name: mock-jwks + image: python:3.12-slim + command: ["python", "/app/server.py"] + ports: + - containerPort: 8000 + 
volumeMounts: + - name: server-script + mountPath: /app/server.py + subPath: server.py + readinessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 2 + periodSeconds: 5 + livenessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 5 + periodSeconds: 10 + volumes: + - name: server-script + configMap: + name: mock-jwks-script +--- +apiVersion: v1 +kind: Service +metadata: + name: mock-jwks + namespace: e2e-rhoai-dsc +spec: + selector: + app: mock-jwks + ports: + - port: 8000 + targetPort: 8000 diff --git a/tests/e2e-prow/rhoai/manifests/test-pod/spin-up.yaml b/tests/e2e-prow/rhoai/manifests/test-pod/spin-up.yaml deleted file mode 100644 index f11778c0d..000000000 --- a/tests/e2e-prow/rhoai/manifests/test-pod/spin-up.yaml +++ /dev/null @@ -1,30 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: test-pod - namespace: e2e-rhoai-dsc -spec: - containers: - - name: test-container - env: - - name: E2E_LSC_HOSTNAME - valueFrom: - secretKeyRef: - name: lcs-ip-secret - key: key - - name: E2E_LLAMA_HOSTNAME - valueFrom: - secretKeyRef: - name: llama-stack-ip-secret - key: key - image: registry.access.redhat.com/ubi9/python-312 - command: ["/bin/sh", "/scripts/run-tests.sh"] - volumeMounts: - - name: script-volume - mountPath: /scripts - volumes: - - name: script-volume - configMap: - name: test-script-cm - defaultMode: 0755 # Make the script executable - restartPolicy: Never \ No newline at end of file diff --git a/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-cpu.yaml b/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-cpu.yaml index 4c81d6b01..4c3f5e7bd 100644 --- a/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-cpu.yaml +++ b/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-cpu.yaml @@ -13,7 +13,7 @@ spec: containers: - args: # - /mnt/models/ - - meta-llama/Llama-3.2-1B-Instruct + - meta-llama/Llama-3.1-8B-Instruct - --enable-auto-tool-choice - --tool-call-parser - llama3_json diff --git 
a/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-gpu.yaml b/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-gpu.yaml index 2027cfcf2..b7597991c 100644 --- a/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-gpu.yaml +++ b/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-gpu.yaml @@ -13,12 +13,12 @@ spec: containers: - args: - --model - - meta-llama/Llama-3.2-1B-Instruct + - meta-llama/Llama-3.1-8B-Instruct - --enable-auto-tool-choice - --tool-call-parser - llama3_json - --chat-template - - /mnt/chat-template/tool_chat_template_llama3.2_json.jinja + - /mnt/chat-template/tool_chat_template_llama3.1_json.jinja - --download-dir - /tmp/models-cache - --port diff --git a/tests/e2e-prow/rhoai/pipeline-services.sh b/tests/e2e-prow/rhoai/pipeline-services.sh index 8d011bce7..cd33ab9d5 100755 --- a/tests/e2e-prow/rhoai/pipeline-services.sh +++ b/tests/e2e-prow/rhoai/pipeline-services.sh @@ -2,10 +2,11 @@ BASE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# Deploy llama-stack envsubst < "$BASE_DIR/manifests/lightspeed/llama-stack.yaml" | oc apply -f - oc wait pod/llama-stack-service \ --n e2e-rhoai-dsc --for=condition=Ready --timeout=600s + -n e2e-rhoai-dsc --for=condition=Ready --timeout=600s # Get url address of llama-stack pod oc label pod llama-stack-service pod=llama-stack-service -n e2e-rhoai-dsc @@ -22,4 +23,5 @@ oc create secret generic llama-stack-ip-secret \ --from-literal=key="$E2E_LLAMA_HOSTNAME" \ -n e2e-rhoai-dsc || echo "Secret exists" -oc apply -f "$BASE_DIR/manifests/lightspeed/lightspeed-stack.yaml" \ No newline at end of file +# Deploy lightspeed-stack +oc apply -f "$BASE_DIR/manifests/lightspeed/lightspeed-stack.yaml" diff --git a/tests/e2e-prow/rhoai/pipeline.sh b/tests/e2e-prow/rhoai/pipeline.sh index 718dc36ae..6e66ff94f 100755 --- a/tests/e2e-prow/rhoai/pipeline.sh +++ b/tests/e2e-prow/rhoai/pipeline.sh @@ -2,32 +2,29 @@ set -euo pipefail trap 'echo "❌ Pipeline failed at line $LINENO"; exit 1' ERR +# Signal to e2e tests that we're running in 
Prow/OpenShift +export RUNNING_PROW=true #======================================== # 1. GLOBAL CONFIG #======================================== NAMESPACE="e2e-rhoai-dsc" -MODEL_NAME="meta-llama/Llama-3.2-1B-Instruct" +MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct" PIPELINE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -# Get llama-stack image from GitHub Containerfile -echo "Fetching llama-stack image from GitHub..." -LLAMA_STACK_IMAGE=$(curl -sL https://raw.githubusercontent.com/lightspeed-core/lightspeed-stack/main/test.containerfile | grep -m1 '^FROM' | awk '{print $2}') -if [ -z "$LLAMA_STACK_IMAGE" ]; then - echo "❌ Failed to fetch llama-stack image from GitHub" - exit 1 -fi -echo " -> Found llama-stack image: $LLAMA_STACK_IMAGE" +# RHOAI llama-stack image +LLAMA_STACK_IMAGE="${LLAMA_STACK_IMAGE:-quay.io/rhoai/odh-llama-stack-core-rhel9:rhoai-3.3}" +echo "Using llama-stack image: $LLAMA_STACK_IMAGE" export LLAMA_STACK_IMAGE #======================================== # 2. ENVIRONMENT SETUP #======================================== echo "===== Setting up environment variables =====" -export HUGGING_FACE_HUB_TOKEN=$(cat /var/run/huggingface/hf-token-ces-lcore-test || true) -export VLLM_API_KEY=$(cat /var/run/vllm/vllm-api-key-lcore-test || true) -export QUAY_ROBOT_NAME=$(cat /var/run/quay-aipcc-name/lcore-quay-name-lcore-test || true) -export QUAY_ROBOT_PASSWORD=$(cat /var/run/quay-aipcc-password/lcore-quay-password-lcore-test || true) +#export HUGGING_FACE_HUB_TOKEN=$(cat /var/run/huggingface/hf-token-ces-lcore-test || true) +#export VLLM_API_KEY=$(cat /var/run/vllm/vllm-api-key-lcore-test || true) +#export QUAY_ROBOT_NAME=$(cat /var/run/quay-aipcc-name/lcore-quay-name-lcore-test || true) +#export QUAY_ROBOT_PASSWORD=$(cat /var/run/quay-aipcc-password/lcore-quay-password-lcore-test || true) [[ -n "$HUGGING_FACE_HUB_TOKEN" ]] && echo "✅ HUGGING_FACE_HUB_TOKEN is set" || { echo "❌ Missing HUGGING_FACE_HUB_TOKEN"; exit 1; } @@ -77,12 +74,12 @@ oc secrets link 
default quay-lightspeed-pull-secret --for=pull -n "$NAMESPACE" 2 #======================================== echo "===== Setting up configmaps =====" -curl -sL -o tool_chat_template_llama3.2_json.jinja \ - https://raw.githubusercontent.com/vllm-project/vllm/main/examples/tool_chat_template_llama3.2_json.jinja \ +curl -sL -o tool_chat_template_llama3.1_json.jinja \ + https://raw.githubusercontent.com/vllm-project/vllm/main/examples/tool_chat_template_llama3.1_json.jinja \ || { echo "❌ Failed to download jinja template"; exit 1; } oc create configmap vllm-chat-template -n "$NAMESPACE" \ - --from-file=tool_chat_template_llama3.2_json.jinja --dry-run=client -o yaml | oc apply -f - + --from-file=tool_chat_template_llama3.1_json.jinja --dry-run=client -o yaml | oc apply -f - #======================================== @@ -97,7 +94,7 @@ oc get pods -n "$NAMESPACE" # 6. WAIT FOR POD & TEST API #======================================== source pod.env -oc wait --for=condition=Ready pod/$POD_NAME -n $NAMESPACE --timeout=300s +oc wait --for=condition=Ready pod/$POD_NAME -n $NAMESPACE --timeout=600s echo "===== Testing vLLM endpoint =====" start_time=$(date +%s) @@ -154,14 +151,87 @@ oc delete pod vllm-test-curl -n "$NAMESPACE" --ignore-not-found=true #======================================== -# 7. DEPLOY LIGHTSPEED STACK AND LLAMA STACK +# 7. DEPLOY MOCK SERVERS (JWKS & MCP) +#======================================== +echo "===== Deploying Mock Servers =====" + +# Navigate to repo root to access server scripts +REPO_ROOT="$(cd "$PIPELINE_DIR/../../.." && pwd)" + +# Create ConfigMaps from server scripts +echo "Creating mock server ConfigMaps..." 
+oc create configmap mock-jwks-script -n "$NAMESPACE" \ + --from-file=server.py="$REPO_ROOT/tests/e2e/mock_jwks_server/server.py" \ + --dry-run=client -o yaml | oc apply -f - + +oc create configmap mcp-mock-server-script -n "$NAMESPACE" \ + --from-file=server.py="$REPO_ROOT/dev-tools/mcp-mock-server/server.py" \ + --dry-run=client -o yaml | oc apply -f - + +# Deploy mock server pods and services +echo "Deploying mock-jwks..." +oc apply -f "$PIPELINE_DIR/manifests/lightspeed/mock-jwks.yaml" + +echo "Deploying mcp-mock-server..." +oc apply -f "$PIPELINE_DIR/manifests/lightspeed/mcp-mock-server.yaml" + +# Wait for mock servers to be ready +echo "Waiting for mock servers to be ready..." +oc wait pod/mock-jwks pod/mcp-mock-server \ + -n "$NAMESPACE" --for=condition=Ready --timeout=120s || { + echo "⚠️ Mock servers not ready, checking status..." + oc get pods -n "$NAMESPACE" | grep -E "mock-jwks|mcp-mock" || true + oc describe pod mock-jwks -n "$NAMESPACE" 2>/dev/null | tail -20 || true + oc describe pod mcp-mock-server -n "$NAMESPACE" 2>/dev/null | tail -20 || true +} +echo "✅ Mock servers deployed" + +#======================================== +# 8. DEPLOY LIGHTSPEED STACK AND LLAMA STACK #======================================== echo "===== Deploying Services =====" create_secret api-url-secret --from-literal=key="$KSVC_URL" oc create configmap llama-stack-config -n "$NAMESPACE" --from-file=configs/run.yaml oc create configmap lightspeed-stack-config -n "$NAMESPACE" --from-file=configs/lightspeed-stack.yaml -oc create configmap test-script-cm -n "$NAMESPACE" --from-file=run-tests.sh + +# Create RAG data ConfigMap from the e2e test RAG data +echo "Creating RAG data ConfigMap..." +RAG_DB_PATH="$REPO_ROOT/tests/e2e/rag/kv_store.db" +if [ -f "$RAG_DB_PATH" ]; then + # Extract vector store ID from kv_store.db using Python (sqlite3 CLI may not be available) + echo "Extracting vector store ID from kv_store.db..." 
+ # Key format is: vector_stores:v3::vs_xxx or openai_vector_stores:v3::vs_xxx + export FAISS_VECTOR_STORE_ID=$(python3 -c " +import sqlite3 +import re +conn = sqlite3.connect('$RAG_DB_PATH') +cursor = conn.cursor() +cursor.execute(\"SELECT key FROM kvstore WHERE key LIKE 'vector_stores:v%::%' LIMIT 1\") +row = cursor.fetchone() +if row: + # Extract the vs_xxx ID from the key + match = re.search(r'(vs_[a-f0-9-]+)', row[0]) + if match: + print(match.group(1)) +conn.close() +" 2>/dev/null || echo "") + + if [ -n "$FAISS_VECTOR_STORE_ID" ]; then + echo "✅ Extracted FAISS_VECTOR_STORE_ID: $FAISS_VECTOR_STORE_ID" + # Create secret for llama-stack to use + create_secret faiss-vector-store-secret --from-literal=id="$FAISS_VECTOR_STORE_ID" + else + echo "❌ No vector_store found in $RAG_DB_PATH - FAISS tests will fail!" + fi + + gzip -c "$RAG_DB_PATH" > /tmp/kv_store.db.gz + oc create configmap rag-data -n "$NAMESPACE" --from-file=kv_store.db.gz=/tmp/kv_store.db.gz + rm /tmp/kv_store.db.gz + echo "✅ RAG data ConfigMap created from $RAG_DB_PATH" +else + echo "⚠️ No kv_store.db found at $RAG_DB_PATH" +fi ./pipeline-services.sh @@ -191,7 +261,6 @@ if ! oc wait pod/lightspeed-stack-service pod/llama-stack-service \ exit 1 fi echo "✅ Both service pods are ready" -sleep 30 oc get pods -n "$NAMESPACE" @@ -205,7 +274,7 @@ oc describe pod llama-stack-service -n "$NAMESPACE" || true #======================================== -# 8. EXTRACT LCS IP & STORE +# 9. EXPOSE SERVICE & START PORT-FORWARD #======================================== oc label pod lightspeed-stack-service pod=lightspeed-stack-service -n $NAMESPACE @@ -215,54 +284,52 @@ oc expose pod lightspeed-stack-service \ --type=ClusterIP \ -n $NAMESPACE -E2E_LSC_HOSTNAME="lightspeed-stack-service-svc.$NAMESPACE.svc.cluster.local" -echo "LCS IP: $E2E_LSC_HOSTNAME" +# Kill any existing processes on ports 8080 and 8000 +echo "Checking for existing processes on ports 8080 and 8000..." 
+lsof -ti:8080 | xargs kill -9 2>/dev/null || true +lsof -ti:8000 | xargs kill -9 2>/dev/null || true -create_secret lcs-ip-secret --from-literal=key="$E2E_LSC_HOSTNAME" +# Start port-forward for lightspeed-stack +echo "Starting port-forward for lightspeed-stack..." +oc port-forward svc/lightspeed-stack-service-svc 8080:8080 -n $NAMESPACE & +PF_LCS_PID=$! +# Start port-forward for mock-jwks (needed for RBAC tests to get tokens) +echo "Starting port-forward for mock-jwks..." +oc port-forward svc/mock-jwks 8000:8000 -n $NAMESPACE & +PF_JWKS_PID=$! -#======================================== -# 9. LOGGING & TEST EXECUTION -#======================================== -echo "===== Running test pod =====" -./pipeline-test-pod.sh +sleep 10 -sleep 20 -oc get pods -n "$NAMESPACE" +export E2E_LSC_HOSTNAME="localhost" +export E2E_JWKS_HOSTNAME="localhost" +echo "LCS accessible at: http://$E2E_LSC_HOSTNAME:8080" +echo "Mock JWKS accessible at: http://$E2E_JWKS_HOSTNAME:8000" -# Wait until tests are complete -oc wait --for=condition=Ready=True pod/test-pod -n $NAMESPACE --timeout=900s || oc wait --for=condition=Ready=False pod/test-pod -n $NAMESPACE --timeout=60s -start_time=$(date +%s) -timeout=2400 -while true; do - sleep 120 - - PHASE=$(oc get pod test-pod -n $NAMESPACE -o jsonpath='{.status.phase}') - echo "Current phase test-pod: $PHASE" - if [[ "$PHASE" == "Succeeded" || "$PHASE" == "Failed" ]]; then - break - fi - - current_time=$(date +%s) - elapsed=$((current_time - start_time)) - if (( elapsed >= timeout )); then - echo "⏰ Timeout reached ($timeout seconds). Stopping test." - exit 1 - fi +#======================================== +# 10. 
RUN TESTS +#======================================== +echo "===== Running E2E tests =====" - oc get pods -n "$NAMESPACE" -done -oc logs test-pod -n $NAMESPACE || oc describe pod test-pod -n $NAMESPACE || true +# Ensure run-tests.sh is executable +chmod +x ./run-tests.sh +# Disable exit on error to capture test exit code +set +e +./run-tests.sh +TEST_EXIT_CODE=$? +set -e -TEST_EXIT_CODE=$(oc get pod test-pod -n $NAMESPACE -o jsonpath='{.status.containerStatuses[0].state.terminated.exitCode}') +# Cleanup port-forwards +kill $PF_LCS_PID 2>/dev/null || true +kill $PF_JWKS_PID 2>/dev/null || true echo "===== E2E COMPLETE =====" -if [ "${TEST_EXIT_CODE:-2}" -ne 0 ]; then - echo "❌ E2E tests failed with exit code $TEST_EXIT_CODE (pod/test-pod failed)" +if [ "${TEST_EXIT_CODE:-1}" -ne 0 ]; then + echo "❌ E2E tests failed with exit code $TEST_EXIT_CODE" else echo "✅ E2E tests succeeded" fi diff --git a/tests/e2e-prow/rhoai/run-tests.sh b/tests/e2e-prow/rhoai/run-tests.sh old mode 100644 new mode 100755 index 657b8124b..41aab3441 --- a/tests/e2e-prow/rhoai/run-tests.sh +++ b/tests/e2e-prow/rhoai/run-tests.sh @@ -1,17 +1,37 @@ -git clone https://github.com/lightspeed-core/lightspeed-stack.git -cd lightspeed-stack +#!/bin/bash +set -e -echo "pod started" -echo $E2E_LSC_HOSTNAME +# Go to repo root (run-tests.sh is in tests/e2e-prow/rhoai/) +cd "$(dirname "$0")/../../.." -curl -f http://$E2E_LSC_HOSTNAME:8080/v1/models || { - echo "❌ Basic connectivity failed - showing logs before running full tests" +# FAISS_VECTOR_STORE_ID should be exported by pipeline.sh +if [ -z "$FAISS_VECTOR_STORE_ID" ]; then + echo "❌ FAISS_VECTOR_STORE_ID is not set - should be exported by pipeline.sh" exit 1 -} +fi + +echo "Running tests from: $(pwd)" +echo "E2E_LSC_HOSTNAME: $E2E_LSC_HOSTNAME" +echo "FAISS_VECTOR_STORE_ID: $FAISS_VECTOR_STORE_ID" + +# Wait for service to be ready (retry up to 60 seconds) +echo "Waiting for service to be ready..." 
+for i in $(seq 1 12); do + if curl -sf http://$E2E_LSC_HOSTNAME:8080/v1/models > /dev/null 2>&1; then + echo "✅ Service is responding" + break + fi + if [ $i -eq 12 ]; then + echo "❌ Basic connectivity failed after 60 seconds" + exit 1 + fi + echo " Attempt $i/12 - service not ready, waiting 5s..." + sleep 5 +done echo "Installing test dependencies..." pip install uv uv sync -echo "Running comprehensive e2e test suite..." -make test-e2e \ No newline at end of file +echo "Running e2e test suite..." +make test-e2e diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh new file mode 100755 index 000000000..0beb9e28c --- /dev/null +++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh @@ -0,0 +1,234 @@ +#!/bin/bash +# Consolidated E2E operations script for OpenShift/Prow environment +# Usage: e2e-ops.sh [args...] +# +# Commands: +# restart-lightspeed - Restart lightspeed-stack pod and port-forward +# restart-llama-stack - Restart/restore llama-stack pod +# restart-port-forward - Re-establish port-forward for lightspeed +# wait-for-pod [attempts] - Wait for a pod to be ready +# update-configmap - Update ConfigMap from file +# get-configmap-content - Get ConfigMap content (outputs to stdout) +# disrupt-llama-stack - Delete llama-stack pod to disrupt connection + +set -e + +NAMESPACE="${NAMESPACE:-e2e-rhoai-dsc}" +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +MANIFEST_DIR="$SCRIPT_DIR/../manifests/lightspeed" + +# ============================================================================ +# Helper functions +# ============================================================================ + +wait_for_pod() { + local pod_name="$1" + local max_attempts="${2:-24}" + + for ((attempt=1; attempt<=max_attempts; attempt++)); do + local ready + ready=$(oc get pod "$pod_name" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[0].ready}' 2>/dev/null || echo "false") + if [[ "$ready" == "true" ]]; then + echo "✓ Pod $pod_name ready" + return 0 + fi 
+ sleep 3 + done + + echo "Pod $pod_name not ready after $((max_attempts * 3))s" + return 1 +} + +verify_connectivity() { + local max_attempts="${1:-6}" + local local_port="${LOCAL_PORT:-8080}" + local http_code="" + + for ((attempt=1; attempt<=max_attempts; attempt++)); do + # Check readiness endpoint - accept 200 or 401 (auth required but service is up) + http_code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 "http://localhost:$local_port/readiness" 2>/dev/null) || http_code="000" + + if [[ "$http_code" == "200" || "$http_code" == "401" ]]; then + return 0 + fi + + if [[ $attempt -lt $max_attempts ]]; then + sleep 2 + fi + done + + echo "Connectivity check failed (HTTP: ${http_code:-unknown})" + return 1 +} + +# ============================================================================ +# Command implementations +# ============================================================================ + +cmd_restart_lightspeed() { + echo "Restarting lightspeed-stack service..." + + # Delete existing pod with timeout + timeout 60 oc delete pod lightspeed-stack-service -n "$NAMESPACE" --ignore-not-found=true --wait=true || { + oc delete pod lightspeed-stack-service -n "$NAMESPACE" --ignore-not-found=true --force --grace-period=0 2>/dev/null || true + sleep 2 + } + + # Apply manifest + oc apply -f "$MANIFEST_DIR/lightspeed-stack.yaml" + + # Wait for pod to be ready + wait_for_pod "lightspeed-stack-service" 20 + + # Re-label pod for service discovery + oc label pod lightspeed-stack-service pod=lightspeed-stack-service -n "$NAMESPACE" --overwrite + + # Re-establish port-forward + cmd_restart_port_forward + + echo "✓ Lightspeed restart complete" +} + +cmd_restart_llama_stack() { + echo "===== Restoring llama-stack service =====" + + # Apply manifest (creates pod if not exists) + # Use envsubst to expand ${LLAMA_STACK_IMAGE} and other env vars + echo "Applying pod manifest..." 
+ envsubst < "$MANIFEST_DIR/llama-stack.yaml" | oc apply -f - + + # Wait for pod to be ready + wait_for_pod "llama-stack-service" 24 + + # Re-label pod for service discovery + echo "Labeling pod for service..." + oc label pod llama-stack-service pod=llama-stack-service -n "$NAMESPACE" --overwrite + + echo "===== Llama-stack restore complete =====" +} + +cmd_restart_port_forward() { + local local_port="${LOCAL_PORT:-8080}" + local remote_port="${REMOTE_PORT:-8080}" + local max_attempts=3 + + echo "Re-establishing port-forward on $local_port:$remote_port..." + + for ((attempt=1; attempt<=max_attempts; attempt++)); do + # Kill existing port-forward processes + pkill -9 -f "oc port-forward.*lightspeed" 2>/dev/null || true + sleep 1 + + # Start new port-forward in background + nohup oc port-forward svc/lightspeed-stack-service-svc "$local_port:$remote_port" -n "$NAMESPACE" > /tmp/port-forward.log 2>&1 & + local pf_pid=$! + disown $pf_pid 2>/dev/null || true + sleep 5 + + # Verify connectivity (more attempts for larger models) + if verify_connectivity 10; then + echo "✓ Port-forward established (PID: $pf_pid)" + return 0 + fi + + if [[ $attempt -lt $max_attempts ]]; then + echo "Attempt $attempt failed, retrying..." + sleep 3 + fi + done + + echo "Failed to establish port-forward" + cat /tmp/port-forward.log 2>/dev/null | tail -5 || true + return 1 +} + +cmd_wait_for_pod() { + local pod_name="${1:?Pod name required}" + local max_attempts="${2:-24}" + wait_for_pod "$pod_name" "$max_attempts" +} + +cmd_update_configmap() { + local configmap_name="${1:?ConfigMap name required}" + local source_file="${2:?Source file required}" + + echo "Updating ConfigMap $configmap_name from $source_file..." 
+ + # Delete existing configmap + oc delete configmap "$configmap_name" -n "$NAMESPACE" --ignore-not-found=true + + # Create new configmap from the source file + oc create configmap "$configmap_name" -n "$NAMESPACE" \ + --from-file="lightspeed-stack.yaml=$source_file" + + echo "✓ ConfigMap $configmap_name updated successfully" +} + +cmd_get_configmap_content() { + local configmap_name="${1:?ConfigMap name required}" + oc get configmap "$configmap_name" -n "$NAMESPACE" \ + -o 'jsonpath={.data.lightspeed-stack\.yaml}' +} + +cmd_disrupt_llama_stack() { + local pod_name="llama-stack-service" + + # Check if pod exists and is running + local phase + phase=$(oc get pod "$pod_name" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound") + + if [[ "$phase" == "Running" ]]; then + # Delete the pod to disrupt connection + oc delete pod "$pod_name" -n "$NAMESPACE" --wait=true + sleep 2 + echo "Llama Stack connection disrupted successfully (pod deleted)" + exit 0 + else + echo "Llama Stack pod was not running (phase: $phase)" + exit 2 + fi +} + +# ============================================================================ +# Main command dispatcher +# ============================================================================ + +COMMAND="${1:-}" +shift || true + +case "$COMMAND" in + restart-lightspeed) + cmd_restart_lightspeed + ;; + restart-llama-stack) + cmd_restart_llama_stack + ;; + restart-port-forward) + cmd_restart_port_forward + ;; + wait-for-pod) + cmd_wait_for_pod "$@" + ;; + update-configmap) + cmd_update_configmap "$@" + ;; + get-configmap-content) + cmd_get_configmap_content "$@" + ;; + disrupt-llama-stack) + cmd_disrupt_llama_stack + ;; + *) + echo "Usage: $0 [args...]" + echo "" + echo "Commands:" + echo " restart-lightspeed - Restart lightspeed-stack pod and port-forward" + echo " restart-llama-stack - Restart/restore llama-stack pod" + echo " restart-port-forward - Re-establish port-forward for lightspeed" + echo " wait-for-pod 
[attempts] - Wait for a pod to be ready" + echo " update-configmap - Update ConfigMap from file" + echo " get-configmap-content - Get ConfigMap content (outputs to stdout)" + echo " disrupt-llama-stack - Delete llama-stack pod to disrupt connection" + exit 1 + ;; +esac diff --git a/tests/e2e/features/environment.py b/tests/e2e/features/environment.py index 3d6a4fdae..b731ca743 100644 --- a/tests/e2e/features/environment.py +++ b/tests/e2e/features/environment.py @@ -13,10 +13,12 @@ import requests from behave.model import Feature, Scenario +from tests.e2e.utils.prow_utils import restore_llama_stack_pod from behave.runner import Context from tests.e2e.utils.utils import ( create_config_backup, + is_prow_environment, remove_config_backup, restart_container, switch_config, @@ -25,6 +27,38 @@ FALLBACK_MODEL = "gpt-4o-mini" FALLBACK_PROVIDER = "openai" +# Config file mappings: config_name -> (docker_path, prow_path) +_CONFIG_PATHS = { + "no-cache": ( + "tests/e2e/configuration/{mode_dir}/lightspeed-stack-no-cache.yaml", + "tests/e2e-prow/rhoai/configs/lightspeed-stack-no-cache.yaml", + ), + "auth-noop-token": ( + "tests/e2e/configuration/{mode_dir}/lightspeed-stack-auth-noop-token.yaml", + "tests/e2e-prow/rhoai/configs/lightspeed-stack-auth-noop-token.yaml", + ), + "rbac": ( + "tests/e2e/configuration/{mode_dir}/lightspeed-stack-rbac.yaml", + "tests/e2e-prow/rhoai/configs/lightspeed-stack-rbac.yaml", + ), + "invalid-feedback-storage": ( + "tests/e2e/configuration/{mode_dir}/lightspeed-stack-invalid-feedback-storage.yaml", + "tests/e2e-prow/rhoai/configs/lightspeed-stack-invalid-feedback-storage.yaml", + ), + "rh-identity": ( + "tests/e2e/configuration/{mode_dir}/lightspeed-stack-auth-rh-identity.yaml", + "tests/e2e-prow/rhoai/configs/lightspeed-stack-auth-rh-identity.yaml", + ), +} + + +def _get_config_path(config_name: str, mode_dir: str) -> str: + """Get the appropriate config path based on environment.""" + docker_path_template, prow_path = _CONFIG_PATHS[config_name] 
+ if is_prow_environment(): + return prow_path + return docker_path_template.format(mode_dir=mode_dir) + def _fetch_models_from_service() -> dict: """Query /v1/models endpoint and return first LLM model. @@ -138,11 +172,11 @@ def before_scenario(context: Context, scenario: Scenario) -> None: mode_dir = "library-mode" if context.is_library_mode else "server-mode" if "InvalidFeedbackStorageConfig" in scenario.effective_tags: - context.scenario_config = f"tests/e2e/configuration/{mode_dir}/lightspeed-stack-invalid-feedback-storage.yaml" + context.scenario_config = _get_config_path("invalid-feedback-storage", mode_dir) if "NoCacheConfig" in scenario.effective_tags: - context.scenario_config = ( - f"tests/e2e/configuration/{mode_dir}/lightspeed-stack-no-cache.yaml" - ) + context.scenario_config = _get_config_path("no-cache", mode_dir) + switch_config(context.scenario_config) + restart_container("lightspeed-stack") def after_scenario(context: Context, scenario: Scenario) -> None: @@ -171,63 +205,67 @@ def after_scenario(context: Context, scenario: Scenario) -> None: scenario-specific teardown actions to run (e.g., "InvalidFeedbackStorageConfig", "NoCacheConfig"). 
""" - if "InvalidFeedbackStorageConfig" in scenario.effective_tags: - switch_config(context.feature_config) - restart_container("lightspeed-stack") - if "NoCacheConfig" in scenario.effective_tags: + # Restore Llama Stack FIRST (before any lightspeed-stack restart) + llama_was_running = getattr(context, "llama_stack_was_running", False) + if llama_was_running: + _restore_llama_stack(context) + context.llama_stack_was_running = False + + # Tags that require config restoration after scenario + config_restore_tags = {"InvalidFeedbackStorageConfig", "NoCacheConfig"} + if config_restore_tags & set(scenario.effective_tags): switch_config(context.feature_config) restart_container("lightspeed-stack") - # Restore Llama Stack connection if it was disrupted (only in server mode) - if ( - not context.is_library_mode - and hasattr(context, "llama_stack_was_running") - and context.llama_stack_was_running - ): - try: - # Start the llama-stack container again - subprocess.run( - ["docker", "start", "llama-stack"], check=True, capture_output=True - ) - # Wait for the service to be healthy - print("Restoring Llama Stack connection...") - time.sleep(20) - - # Check if it's healthy - for attempt in range(6): # Try for 30 seconds - try: - result = subprocess.run( - [ - "docker", - "exec", - "llama-stack", - "curl", - "-f", - f"http://{context.hostname_llama}:{context.port_llama}/v1/health", - ], - capture_output=True, - timeout=5, - check=True, - ) - if result.returncode == 0: - print("✓ Llama Stack connection restored successfully") - break - except subprocess.TimeoutExpired: - print(f"⏱Health check timed out on attempt {attempt + 1}/6") - - if attempt < 5: - print( - f"Waiting for Llama Stack to be healthy... 
(attempt {attempt + 1}/6)" - ) - time.sleep(5) - else: - print( - "Warning: Llama Stack may not be fully healthy after restoration" - ) - - except subprocess.CalledProcessError as e: - print(f"Warning: Could not restore Llama Stack connection: {e}") +def _restore_llama_stack(context: Context) -> None: + """Restore Llama Stack connection after disruption.""" + if is_prow_environment(): + restore_llama_stack_pod() + return + + try: + # Start the llama-stack container again + subprocess.run( + ["docker", "start", "llama-stack"], check=True, capture_output=True + ) + + # Wait for the service to be healthy + print("Restoring Llama Stack connection...") + time.sleep(5) + + # Check if it's healthy + for attempt in range(6): # Try for 30 seconds + try: + result = subprocess.run( + [ + "docker", + "exec", + "llama-stack", + "curl", + "-f", + f"http://{context.hostname_llama}:{context.port_llama}/v1/health", + ], + capture_output=True, + timeout=5, + check=True, + ) + if result.returncode == 0: + print("✓ Llama Stack connection restored successfully") + break + except subprocess.TimeoutExpired: + print(f"⏱Health check timed out on attempt {attempt + 1}/6") + + if attempt < 5: + print( + f"Waiting for Llama Stack to be healthy... (attempt {attempt + 1}/6)" + ) + time.sleep(5) + else: + print("Warning: Llama Stack may not be fully healthy after restoration") + + except subprocess.CalledProcessError as e: + print(f"Warning: Could not restore Llama Stack connection: {e}") def before_feature(context: Context, feature: Feature) -> None: @@ -235,29 +273,21 @@ def before_feature(context: Context, feature: Feature) -> None: Prepare per-feature test environment and apply feature-specific configuration. 
""" + mode_dir = "library-mode" if context.is_library_mode else "server-mode" if "Authorized" in feature.tags: - mode_dir = "library-mode" if context.is_library_mode else "server-mode" - context.feature_config = ( - f"tests/e2e/configuration/{mode_dir}/lightspeed-stack-auth-noop-token.yaml" - ) + context.feature_config = _get_config_path("auth-noop-token", mode_dir) context.default_config_backup = create_config_backup("lightspeed-stack.yaml") switch_config(context.feature_config) restart_container("lightspeed-stack") if "RBAC" in feature.tags: - mode_dir = "library-mode" if context.is_library_mode else "server-mode" - context.feature_config = ( - f"tests/e2e/configuration/{mode_dir}/lightspeed-stack-rbac.yaml" - ) + context.feature_config = _get_config_path("rbac", mode_dir) context.default_config_backup = create_config_backup("lightspeed-stack.yaml") switch_config(context.feature_config) restart_container("lightspeed-stack") if "RHIdentity" in feature.tags: - mode_dir = "library-mode" if context.is_library_mode else "server-mode" - context.feature_config = ( - f"tests/e2e/configuration/{mode_dir}/lightspeed-stack-auth-rh-identity.yaml" - ) + context.feature_config = _get_config_path("rh-identity", mode_dir) context.default_config_backup = create_config_backup("lightspeed-stack.yaml") switch_config(context.feature_config) restart_container("lightspeed-stack") @@ -292,6 +322,5 @@ def after_feature(context: Context, feature: Feature) -> None: if "Feedback" in feature.tags: for conversation_id in context.feedback_conversations: url = f"http://{context.hostname}:{context.port}/v1/conversations/{conversation_id}" - headers = context.auth_headers if hasattr(context, "auth_headers") else {} - response = requests.delete(url, headers=headers) - assert response.status_code == 200, url + response = requests.delete(url, timeout=10) + assert response.status_code == 200, f"{url} returned {response.status_code}" diff --git a/tests/e2e/features/steps/health.py 
b/tests/e2e/features/steps/health.py index 06cd4bb9d..eefbeef04 100644 --- a/tests/e2e/features/steps/health.py +++ b/tests/e2e/features/steps/health.py @@ -5,6 +5,8 @@ from behave import given # pyright: ignore[reportAttributeAccessIssue] from behave.runner import Context +from tests.e2e.utils.utils import is_prow_environment + @given("The llama-stack connection is disrupted") def llama_stack_connection_broken(context: Context) -> None: @@ -27,6 +29,13 @@ def llama_stack_connection_broken(context: Context) -> None: # Store original state for restoration context.llama_stack_was_running = False + if is_prow_environment(): + from tests.e2e.utils.prow_utils import disrupt_llama_stack_pod + + context.llama_stack_was_running = disrupt_llama_stack_pod() + return + + # Docker-based disruption try: result = subprocess.run( ["docker", "inspect", "-f", "{{.State.Running}}", "llama-stack"], diff --git a/tests/e2e/features/steps/llm_query_response.py b/tests/e2e/features/steps/llm_query_response.py index 732f6e291..6e39b2a7c 100644 --- a/tests/e2e/features/steps/llm_query_response.py +++ b/tests/e2e/features/steps/llm_query_response.py @@ -1,12 +1,14 @@ """LLM query and response steps.""" import json +import os import requests from behave import then, step # pyright: ignore[reportAttributeAccessIssue] from behave.runner import Context from tests.e2e.utils.utils import replace_placeholders -DEFAULT_LLM_TIMEOUT = 60 +# Longer timeout for Prow/OpenShift with CPU-based vLLM +DEFAULT_LLM_TIMEOUT = 180 if os.getenv("RUNNING_PROW") else 60 @step("I wait for the response to be completed") diff --git a/tests/e2e/features/steps/rbac.py b/tests/e2e/features/steps/rbac.py index babaa0be5..4a8cb3e6d 100644 --- a/tests/e2e/features/steps/rbac.py +++ b/tests/e2e/features/steps/rbac.py @@ -7,7 +7,10 @@ def get_test_tokens() -> dict[str, str]: - """Fetch test tokens from the mock JWKS server.""" + """Fetch test tokens from the mock JWKS server. 
+ + In Prow environment, mock-jwks is port-forwarded to localhost:8000. + """ jwks_host = os.getenv("E2E_JWKS_HOSTNAME", "localhost") jwks_port = os.getenv("E2E_JWKS_PORT", "8000") tokens_url = f"http://{jwks_host}:{jwks_port}/tokens" diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py new file mode 100644 index 000000000..05caead78 --- /dev/null +++ b/tests/e2e/utils/prow_utils.py @@ -0,0 +1,215 @@ +"""Prow/OpenShift-specific utility functions for E2E tests. + +This module contains all functions that interact with OpenShift via the `oc` CLI +and are only used when running tests in the Prow CI environment. +""" + +import os +import subprocess +import tempfile + + +def get_namespace() -> str: + """Get the Kubernetes namespace for Prow environment.""" + return os.getenv("NAMESPACE", "e2e-rhoai-dsc") + + +# Mapping from container names (used in tests) to pod names (used in OpenShift) +_POD_NAME_MAP = { + "lightspeed-stack": "lightspeed-stack-service", + "llama-stack": "llama-stack-service", +} + + +def get_pod_name(container_name: str) -> str: + """Map container name to OpenShift pod name.""" + return _POD_NAME_MAP.get(container_name, container_name) + + +def _get_e2e_ops_script() -> str: + """Get the path to the consolidated e2e-ops.sh script.""" + tests_dir = os.path.dirname( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + ) + return os.path.join(tests_dir, "e2e-prow/rhoai/scripts/e2e-ops.sh") + + +def run_e2e_ops( + command: str, args: list[str] | None = None, timeout: int = 180 +) -> subprocess.CompletedProcess: + """Run a command via the consolidated e2e-ops.sh script. + + Args: + command: The command to run (e.g., "restart-lightspeed", "wait-for-pod"). + args: Optional list of arguments to pass to the command. + timeout: Timeout in seconds. + + Returns: + CompletedProcess object with stdout/stderr. 
+ """ + script_path = _get_e2e_ops_script() + cmd = ["bash", script_path, command] + (args or []) + return subprocess.run( + cmd, + env={**os.environ, "NAMESPACE": get_namespace()}, + capture_output=True, + text=True, + timeout=timeout, + ) + + +def wait_for_pod_health(pod_name: str, max_attempts: int = 12) -> None: + """Wait for pod to be ready in OpenShift/Prow environment.""" + actual_pod_name = get_pod_name(pod_name) + try: + result = run_e2e_ops("wait-for-pod", [actual_pod_name, str(max_attempts)]) + print(result.stdout, end="") + if result.returncode != 0: + print(result.stderr, end="") + except subprocess.TimeoutExpired: + print(f"Timeout waiting for pod {actual_pod_name}") + + +def restart_pod(container_name: str) -> None: + """Restart lightspeed-stack pod in OpenShift/Prow environment.""" + try: + result = run_e2e_ops("restart-lightspeed", timeout=120) + print(result.stdout, end="") + if result.returncode != 0: + print(result.stderr, end="") + raise subprocess.CalledProcessError( + result.returncode, "restart-lightspeed" + ) + except subprocess.TimeoutExpired as e: + print(f"Failed to restart pod {container_name}: {e}") + raise + + +def restore_llama_stack_pod() -> None: + """Restore Llama Stack pod in Prow/OpenShift environment.""" + try: + result = run_e2e_ops("restart-llama-stack", timeout=180) + print(result.stdout, end="") + if result.returncode != 0: + print(result.stderr, end="") + else: + print("✓ Llama Stack pod restored successfully") + except subprocess.TimeoutExpired: + print("Warning: Timeout while restoring Llama Stack pod") + + +def disrupt_llama_stack_pod() -> bool: + """Disrupt llama-stack connection in Prow/OpenShift environment. + + Returns: + True if the pod was running and has been disrupted, False otherwise. 
+ """ + try: + result = run_e2e_ops("disrupt-llama-stack", timeout=90) + print(result.stdout, end="") + + # Exit code 0 = disrupted (was running), exit code 2 = was not running + if result.returncode == 0: + return True + elif result.returncode == 2: + return False + else: + print(result.stderr, end="") + return False + + except subprocess.TimeoutExpired: + print("Warning: Timeout while disrupting Llama Stack connection") + return False + + +# In-memory storage for ConfigMap backups in Prow environment +_configmap_backups: dict[str, str] = {} + + +def backup_configmap_to_memory() -> str: + """Backup the current ConfigMap content to memory.""" + namespace = get_namespace() + configmap_name = "lightspeed-stack-config" + backup_key = f"{namespace}/{configmap_name}" + + if backup_key in _configmap_backups: + print(f"ConfigMap backup already exists for {backup_key}") + return backup_key + + print(f"Backing up ConfigMap {configmap_name} to memory...") + + try: + result = run_e2e_ops("get-configmap-content", [configmap_name], timeout=30) + if result.returncode != 0: + raise subprocess.CalledProcessError( + result.returncode, "get-configmap-content", result.stderr + ) + + _configmap_backups[backup_key] = result.stdout + print(f"ConfigMap backed up to memory ({len(result.stdout)} bytes)") + return backup_key + + except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e: + print(f"Failed to backup ConfigMap: {e}") + raise + + +def remove_configmap_backup(backup_key: str) -> None: + """Remove a ConfigMap backup from memory.""" + if backup_key in _configmap_backups: + del _configmap_backups[backup_key] + print(f"ConfigMap backup {backup_key} removed from memory") + + +def _recreate_configmap(configmap_name: str, source_file: str) -> None: + """Delete and recreate a ConfigMap from a file. + + Args: + configmap_name: Name of the ConfigMap. + source_file: Path to the file to create the ConfigMap from. 
+ """ + result = run_e2e_ops("update-configmap", [configmap_name, source_file], timeout=60) + if result.returncode != 0: + raise subprocess.CalledProcessError( + result.returncode, "update-configmap", result.stderr + ) + + +def update_config_configmap(source: str) -> None: + """Update the lightspeed-stack-config ConfigMap with new config in Prow environment. + + Args: + source: Either a file path or a backup key from _configmap_backups. + """ + configmap_name = "lightspeed-stack-config" + + # Check if source is a backup key (restore from memory) + if source in _configmap_backups: + config_content = _configmap_backups[source] + print(f"Restoring ConfigMap {configmap_name} from memory backup...") + + # Write content to temp file (oc create configmap requires a file) + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + f.write(config_content) + temp_path = f.name + + try: + _recreate_configmap(configmap_name, temp_path) + print(f"✓ ConfigMap {configmap_name} restored successfully") + except subprocess.CalledProcessError as e: + print(f"Failed to restore ConfigMap: {e}") + raise + finally: + if os.path.exists(temp_path): + os.remove(temp_path) + return + + # Otherwise, source is a file path + print(f"Updating ConfigMap {configmap_name} with config from {source}...") + + try: + _recreate_configmap(configmap_name, source) + print(f"ConfigMap {configmap_name} updated successfully") + except subprocess.CalledProcessError as e: + print(f"Failed to update ConfigMap: {e}") + raise diff --git a/tests/e2e/utils/utils.py b/tests/e2e/utils/utils.py index 9c11dd9a6..6b73a0e1f 100644 --- a/tests/e2e/utils/utils.py +++ b/tests/e2e/utils/utils.py @@ -9,6 +9,19 @@ import jsonschema from behave.runner import Context +from tests.e2e.utils.prow_utils import ( + backup_configmap_to_memory, + remove_configmap_backup, + restart_pod, + update_config_configmap, + wait_for_pod_health, +) + + +def is_prow_environment() -> bool: + """Check if running in 
Prow/OpenShift environment.""" + return os.getenv("RUNNING_PROW") is not None + def normalize_endpoint(endpoint: str) -> str: """Normalize endpoint to be added into the URL. @@ -78,6 +91,10 @@ def wait_for_container_health(container_name: str, max_attempts: int = 3) -> Non container_name (str): Docker container name or ID to check. max_attempts (int): Maximum number of health check attempts (default 3). """ + if is_prow_environment(): + wait_for_pod_health(container_name, max_attempts) + return + for attempt in range(max_attempts): try: result = subprocess.run( @@ -167,6 +184,10 @@ def switch_config( written due to permissions. OSError: For other OS-related failures during the copy operation. """ + if is_prow_environment(): + update_config_configmap(source_path) + return + try: shutil.copy(source_path, destination_path) except (FileNotFoundError, PermissionError, OSError) as e: @@ -188,6 +209,9 @@ def create_config_backup(config_path: str) -> str: PermissionError: If the process lacks permission to read or write the files. OSError: For other OS-level errors encountered while copying. """ + if is_prow_environment(): + return backup_configmap_to_memory() + backup_file = f"{config_path}.backup" if not os.path.exists(backup_file): try: @@ -211,6 +235,10 @@ def remove_config_backup(backup_path: str) -> None: Parameters: backup_path (str): Filesystem path to the backup file to remove. """ + if is_prow_environment(): + remove_configmap_backup(backup_path) + return + if os.path.exists(backup_path): try: os.remove(backup_path) @@ -228,6 +256,10 @@ def restart_container(container_name: str) -> None: subprocess.CalledProcessError: if the `docker restart` command fails. subprocess.TimeoutExpired: if the `docker restart` command times out. 
""" + if is_prow_environment(): + restart_pod(container_name) + return + try: subprocess.run( ["docker", "restart", container_name], From fa81915d6e95027119f53adcad0623e619205c42 Mon Sep 17 00:00:00 2001 From: Radovan Fuchs Date: Thu, 19 Feb 2026 09:56:51 +0100 Subject: [PATCH 02/10] fix black --- tests/e2e/utils/prow_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py index 05caead78..1d63d3fe5 100644 --- a/tests/e2e/utils/prow_utils.py +++ b/tests/e2e/utils/prow_utils.py @@ -77,9 +77,7 @@ def restart_pod(container_name: str) -> None: print(result.stdout, end="") if result.returncode != 0: print(result.stderr, end="") - raise subprocess.CalledProcessError( - result.returncode, "restart-lightspeed" - ) + raise subprocess.CalledProcessError(result.returncode, "restart-lightspeed") except subprocess.TimeoutExpired as e: print(f"Failed to restart pod {container_name}: {e}") raise From 123ba8c0469cb071b0d0204afb91eb20259d9a5b Mon Sep 17 00:00:00 2001 From: Radovan Fuchs Date: Thu, 19 Feb 2026 10:29:21 +0100 Subject: [PATCH 03/10] restore original timeout --- tests/e2e/features/environment.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/e2e/features/environment.py b/tests/e2e/features/environment.py index b731ca743..543803f4c 100644 --- a/tests/e2e/features/environment.py +++ b/tests/e2e/features/environment.py @@ -232,7 +232,7 @@ def _restore_llama_stack(context: Context) -> None: # Wait for the service to be healthy print("Restoring Llama Stack connection...") - time.sleep(5) + time.sleep(20) # Check if it's healthy for attempt in range(6): # Try for 30 seconds @@ -254,7 +254,7 @@ def _restore_llama_stack(context: Context) -> None: print("✓ Llama Stack connection restored successfully") break except subprocess.TimeoutExpired: - print(f"⏱Health check timed out on attempt {attempt + 1}/6") + print(f"⏱ Health check timed out on attempt {attempt + 1}/6") 
if attempt < 5: print( From ae0d01e1ac3a8e0ebf4c6895a5944d6d7b536ad8 Mon Sep 17 00:00:00 2001 From: Radovan Fuchs Date: Thu, 19 Feb 2026 11:07:51 +0100 Subject: [PATCH 04/10] fixes from review comments --- tests/e2e-prow/rhoai/pipeline.sh | 2 ++ tests/e2e/utils/prow_utils.py | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/tests/e2e-prow/rhoai/pipeline.sh b/tests/e2e-prow/rhoai/pipeline.sh index 6e66ff94f..860cf2848 100755 --- a/tests/e2e-prow/rhoai/pipeline.sh +++ b/tests/e2e-prow/rhoai/pipeline.sh @@ -183,6 +183,8 @@ oc wait pod/mock-jwks pod/mcp-mock-server \ oc get pods -n "$NAMESPACE" | grep -E "mock-jwks|mcp-mock" || true oc describe pod mock-jwks -n "$NAMESPACE" 2>/dev/null | tail -20 || true oc describe pod mcp-mock-server -n "$NAMESPACE" 2>/dev/null | tail -20 || true + echo "❌ Mock servers failed to become ready" + exit 1 } echo "✅ Mock servers deployed" diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py index 1d63d3fe5..ad7d805a3 100644 --- a/tests/e2e/utils/prow_utils.py +++ b/tests/e2e/utils/prow_utils.py @@ -66,8 +66,12 @@ def wait_for_pod_health(pod_name: str, max_attempts: int = 12) -> None: print(result.stdout, end="") if result.returncode != 0: print(result.stderr, end="") + raise subprocess.CalledProcessError( ++ result.returncode, "wait-for-pod" ++ ) except subprocess.TimeoutExpired: print(f"Timeout waiting for pod {actual_pod_name}") + raise def restart_pod(container_name: str) -> None: From 8187346bbbe4680980f7442f2eab40c8a93c6e79 Mon Sep 17 00:00:00 2001 From: Radovan Fuchs Date: Thu, 19 Feb 2026 11:18:13 +0100 Subject: [PATCH 05/10] formatting fix --- tests/e2e/utils/prow_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py index ad7d805a3..6f138fe15 100644 --- a/tests/e2e/utils/prow_utils.py +++ b/tests/e2e/utils/prow_utils.py @@ -66,9 +66,7 @@ def wait_for_pod_health(pod_name: str, max_attempts: int = 12) -> None: 
             print(result.stdout, end="")
         if result.returncode != 0:
             print(result.stderr, end="")
-            raise subprocess.CalledProcessError(
-+                result.returncode, "wait-for-pod"
-+            )
+            raise subprocess.CalledProcessError(result.returncode, "wait-for-pod")
     except subprocess.TimeoutExpired:
         print(f"Timeout waiting for pod {actual_pod_name}")
         raise

From 6ac7d88053efeb93bc339c2b3282fbf2d5a9be38 Mon Sep 17 00:00:00 2001
From: Radovan Fuchs
Date: Thu, 19 Feb 2026 14:05:20 +0100
Subject: [PATCH 06/10] enable secrets in prow

---
 tests/e2e-prow/rhoai/pipeline.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/e2e-prow/rhoai/pipeline.sh b/tests/e2e-prow/rhoai/pipeline.sh
index 860cf2848..22d2bdfb3 100755
--- a/tests/e2e-prow/rhoai/pipeline.sh
+++ b/tests/e2e-prow/rhoai/pipeline.sh
@@ -21,10 +21,10 @@ export LLAMA_STACK_IMAGE
 # 2. ENVIRONMENT SETUP
 #========================================
 echo "===== Setting up environment variables ====="
-#export HUGGING_FACE_HUB_TOKEN=$(cat /var/run/huggingface/hf-token-ces-lcore-test || true)
-#export VLLM_API_KEY=$(cat /var/run/vllm/vllm-api-key-lcore-test || true)
-#export QUAY_ROBOT_NAME=$(cat /var/run/quay-aipcc-name/lcore-quay-name-lcore-test || true)
-#export QUAY_ROBOT_PASSWORD=$(cat /var/run/quay-aipcc-password/lcore-quay-password-lcore-test || true)
+export HUGGING_FACE_HUB_TOKEN=$(cat /var/run/huggingface/hf-token-ces-lcore-test || true)
+export VLLM_API_KEY=$(cat /var/run/vllm/vllm-api-key-lcore-test || true)
+export QUAY_ROBOT_NAME=$(cat /var/run/quay-aipcc-name/lcore-quay-name-lcore-test || true)
+export QUAY_ROBOT_PASSWORD=$(cat /var/run/quay-aipcc-password/lcore-quay-password-lcore-test || true)
 
 [[ -n "$HUGGING_FACE_HUB_TOKEN" ]] && echo "✅ HUGGING_FACE_HUB_TOKEN is set" || { echo "❌ Missing HUGGING_FACE_HUB_TOKEN"; exit 1; }
 

From a867ebfdddf0a4d1ad59efe45ad498fc9e2cb2d0 Mon Sep 17 00:00:00 2001
From: Radovan Fuchs
Date: Tue, 24 Feb 2026 09:12:12 +0100
Subject: [PATCH 07/10] add debug logs with timestamps
--- tests/e2e-prow/rhoai/run-tests.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/e2e-prow/rhoai/run-tests.sh b/tests/e2e-prow/rhoai/run-tests.sh index 41aab3441..41f5020e3 100755 --- a/tests/e2e-prow/rhoai/run-tests.sh +++ b/tests/e2e-prow/rhoai/run-tests.sh @@ -4,17 +4,22 @@ set -e # Go to repo root (run-tests.sh is in tests/e2e-prow/rhoai/) cd "$(dirname "$0")/../../.." +# Timestamps to pinpoint where time is spent (e.g. if Prow 2h timeout is hit) +ts() { echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] $*"; } + # FAISS_VECTOR_STORE_ID should be exported by pipeline.sh if [ -z "$FAISS_VECTOR_STORE_ID" ]; then echo "❌ FAISS_VECTOR_STORE_ID is not set - should be exported by pipeline.sh" exit 1 fi +ts "Start run-tests.sh" echo "Running tests from: $(pwd)" echo "E2E_LSC_HOSTNAME: $E2E_LSC_HOSTNAME" echo "FAISS_VECTOR_STORE_ID: $FAISS_VECTOR_STORE_ID" # Wait for service to be ready (retry up to 60 seconds) +ts "Start: wait for service" echo "Waiting for service to be ready..." for i in $(seq 1 12); do if curl -sf http://$E2E_LSC_HOSTNAME:8080/v1/models > /dev/null 2>&1; then @@ -28,10 +33,18 @@ for i in $(seq 1 12); do echo " Attempt $i/12 - service not ready, waiting 5s..." sleep 5 done +ts "End: wait for service" +ts "Start: pip install uv" echo "Installing test dependencies..." pip install uv +ts "End: pip install uv" + +ts "Start: uv sync" uv sync +ts "End: uv sync" +ts "Start: make test-e2e" echo "Running e2e test suite..." 
make test-e2e +ts "End: make test-e2e" From 6d1e377ca1f49a6737986f7ff0e9dc460d768404 Mon Sep 17 00:00:00 2001 From: Radovan Fuchs Date: Tue, 24 Feb 2026 14:40:43 +0100 Subject: [PATCH 08/10] add longer timeout to port forward --- tests/e2e-prow/rhoai/pipeline.sh | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/tests/e2e-prow/rhoai/pipeline.sh b/tests/e2e-prow/rhoai/pipeline.sh index 22d2bdfb3..0c1c43b9f 100755 --- a/tests/e2e-prow/rhoai/pipeline.sh +++ b/tests/e2e-prow/rhoai/pipeline.sh @@ -21,10 +21,10 @@ export LLAMA_STACK_IMAGE # 2. ENVIRONMENT SETUP #======================================== echo "===== Setting up environment variables =====" -export HUGGING_FACE_HUB_TOKEN=$(cat /var/run/huggingface/hf-token-ces-lcore-test || true) -export VLLM_API_KEY=$(cat /var/run/vllm/vllm-api-key-lcore-test || true) -export QUAY_ROBOT_NAME=$(cat /var/run/quay-aipcc-name/lcore-quay-name-lcore-test || true) -export QUAY_ROBOT_PASSWORD=$(cat /var/run/quay-aipcc-password/lcore-quay-password-lcore-test || true) +# export HUGGING_FACE_HUB_TOKEN=$(cat /var/run/huggingface/hf-token-ces-lcore-test || true) +# export VLLM_API_KEY=$(cat /var/run/vllm/vllm-api-key-lcore-test || true) +# export QUAY_ROBOT_NAME=$(cat /var/run/quay-aipcc-name/lcore-quay-name-lcore-test || true) +# export QUAY_ROBOT_PASSWORD=$(cat /var/run/quay-aipcc-password/lcore-quay-password-lcore-test || true) [[ -n "$HUGGING_FACE_HUB_TOKEN" ]] && echo "✅ HUGGING_FACE_HUB_TOKEN is set" || { echo "❌ Missing HUGGING_FACE_HUB_TOKEN"; exit 1; } @@ -301,7 +301,27 @@ echo "Starting port-forward for mock-jwks..." oc port-forward svc/mock-jwks 8000:8000 -n $NAMESPACE & PF_JWKS_PID=$! -sleep 10 +# Wait for port-forward to be usable (app may not be listening immediately; port-forward can drop) +echo "Waiting for port-forward to lightspeed-stack to be ready..." 
+for i in $(seq 1 36); do + if curl -sf http://localhost:8080/v1/models > /dev/null 2>&1; then + echo "✅ Port-forward ready after $(( i * 5 ))s" + break + fi + if [ $i -eq 36 ]; then + echo "❌ Port-forward to lightspeed-stack never became ready (3 min)" + kill $PF_LCS_PID 2>/dev/null || true + kill $PF_JWKS_PID 2>/dev/null || true + exit 1 + fi + # If port-forward process died, restart it (e.g. "connection refused" / "lost connection to pod") + if ! kill -0 $PF_LCS_PID 2>/dev/null; then + echo "Port-forward died, restarting (attempt $i)..." + oc port-forward svc/lightspeed-stack-service-svc 8080:8080 -n $NAMESPACE & + PF_LCS_PID=$! + fi + sleep 5 +done export E2E_LSC_HOSTNAME="localhost" export E2E_JWKS_HOSTNAME="localhost" From 076c41d221202e1e52f493ef1d0035364754b0f3 Mon Sep 17 00:00:00 2001 From: Radovan Fuchs Date: Tue, 24 Feb 2026 19:27:46 +0100 Subject: [PATCH 09/10] end test run without exceptions --- tests/e2e-prow/rhoai/pipeline.sh | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/e2e-prow/rhoai/pipeline.sh b/tests/e2e-prow/rhoai/pipeline.sh index 0c1c43b9f..4ae635fb5 100755 --- a/tests/e2e-prow/rhoai/pipeline.sh +++ b/tests/e2e-prow/rhoai/pipeline.sh @@ -338,15 +338,21 @@ echo "===== Running E2E tests =====" # Ensure run-tests.sh is executable chmod +x ./run-tests.sh -# Disable exit on error to capture test exit code +# Run tests and cleanup port-forwards. Disable ERR trap so we can capture test exit code and reap +# killed port-forwards without the trap firing (ERR fires on any non-zero exit, not only when set -e would exit). +trap - ERR set +e +export E2E_EXIT_CODE_FILE="${PIPELINE_DIR}/.e2e_exit_code" ./run-tests.sh -TEST_EXIT_CODE=$? -set -e - -# Cleanup port-forwards +# Read exit code from file so we get the real test result (shell can overwrite $? 
with "PID Killed" before we use it) +TEST_EXIT_CODE=$(cat "$E2E_EXIT_CODE_FILE" 2>/dev/null || echo 1) +# Kill first so wait doesn't block (if a port-forward is still running, wait would hang) kill $PF_LCS_PID 2>/dev/null || true kill $PF_JWKS_PID 2>/dev/null || true +wait $PF_LCS_PID 2>/dev/null || true +wait $PF_JWKS_PID 2>/dev/null || true +set -e +trap 'echo "❌ Pipeline failed at line $LINENO"; exit 1' ERR echo "===== E2E COMPLETE =====" From 67b55d12fc37627e0afd19d445c7fd2f46c59eb1 Mon Sep 17 00:00:00 2001 From: Radovan Fuchs Date: Tue, 24 Feb 2026 19:35:03 +0100 Subject: [PATCH 10/10] enable env --- tests/e2e-prow/rhoai/pipeline.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/e2e-prow/rhoai/pipeline.sh b/tests/e2e-prow/rhoai/pipeline.sh index 4ae635fb5..e94b7e6f8 100755 --- a/tests/e2e-prow/rhoai/pipeline.sh +++ b/tests/e2e-prow/rhoai/pipeline.sh @@ -21,10 +21,10 @@ export LLAMA_STACK_IMAGE # 2. ENVIRONMENT SETUP #======================================== echo "===== Setting up environment variables =====" -# export HUGGING_FACE_HUB_TOKEN=$(cat /var/run/huggingface/hf-token-ces-lcore-test || true) -# export VLLM_API_KEY=$(cat /var/run/vllm/vllm-api-key-lcore-test || true) -# export QUAY_ROBOT_NAME=$(cat /var/run/quay-aipcc-name/lcore-quay-name-lcore-test || true) -# export QUAY_ROBOT_PASSWORD=$(cat /var/run/quay-aipcc-password/lcore-quay-password-lcore-test || true) +export HUGGING_FACE_HUB_TOKEN=$(cat /var/run/huggingface/hf-token-ces-lcore-test || true) +export VLLM_API_KEY=$(cat /var/run/vllm/vllm-api-key-lcore-test || true) +export QUAY_ROBOT_NAME=$(cat /var/run/quay-aipcc-name/lcore-quay-name-lcore-test || true) +export QUAY_ROBOT_PASSWORD=$(cat /var/run/quay-aipcc-password/lcore-quay-password-lcore-test || true) [[ -n "$HUGGING_FACE_HUB_TOKEN" ]] && echo "✅ HUGGING_FACE_HUB_TOKEN is set" || { echo "❌ Missing HUGGING_FACE_HUB_TOKEN"; exit 1; }