From f5b5a2de649ce2f58be923b09dfc6b68d1e2e8c7 Mon Sep 17 00:00:00 2001 From: "Jackson Ernst - jaxer.eth" <51183683+jaxernst@users.noreply.github.com> Date: Mon, 9 Feb 2026 11:47:07 -0800 Subject: [PATCH 1/2] Sync from lasso-cloud: provider health monitoring + dashboard cleanup Core (lib/lasso/): - Health probe batch coordinator: improved coordination logic - Provider pool: rate limit awareness in health tiering, probe-based health reporting - Selection: rate-limited provider tiering in strategy selection - Circuit breaker: remove redundant error handling - Error classification: updated rate limit detection - JSONRPC error: improved error code handling Dashboard (lib/lasso_web/): - Consolidate format helpers into Formatting module (format_latency, format_rps, format_region_name, success_rate_color) - Remove duplicated inline helpers from chain_details_panel and metrics_tab - Refactor provider_details_panel block height resolution into helper functions - Use cluster_block_heights for real-time consensus height in chain details - Make strategy_display_name/strategy_description public in endpoint_helpers - Remove unused get_strategy_description from helpers - Remove unused chain status route and controller action - Metrics tab: filter nil/unknown regions, use Formatting delegates Tests: - New: error_test, provider_pool_probe_health_test, selection_rate_limit_tiering_test - Updated: error_classification_test, circuit_breaker_test, integration tests Co-Authored-By: Claude Opus 4.6 --- .../core/health_probe/batch_coordinator.ex | 36 ++- lib/lasso/core/providers/provider_pool.ex | 81 ++++++- lib/lasso/core/selection/selection.ex | 19 +- lib/lasso/core/support/circuit_breaker.ex | 4 - .../core/support/error_classification.ex | 4 +- lib/lasso/jsonrpc/error.ex | 10 +- lib/lasso_web/controllers/chain_controller.ex | 5 - .../components/chain_details_panel.ex | 38 +-- .../dashboard/components/metrics_tab.ex | 17 +- .../components/provider_details_panel.ex | 228 
++++++++---------- lib/lasso_web/dashboard/endpoint_helpers.ex | 20 +- lib/lasso_web/dashboard/formatting.ex | 53 ++++ lib/lasso_web/dashboard/helpers.ex | 20 -- lib/lasso_web/dashboard/status_helpers.ex | 24 +- lib/lasso_web/router.ex | 1 - .../health_probe_integration_test.exs | 12 +- .../request_pipeline_integration_test.exs | 51 ++-- .../support/error_classification_test.exs | 4 +- test/lasso/jsonrpc/error_test.exs | 69 ++++++ test/lasso/rpc/circuit_breaker_test.exs | 36 ++- .../rpc/provider_pool_probe_health_test.exs | 179 ++++++++++++++ .../rpc/selection_rate_limit_tiering_test.exs | 140 +++++++++++ 22 files changed, 771 insertions(+), 280 deletions(-) create mode 100644 test/lasso/jsonrpc/error_test.exs create mode 100644 test/lasso/rpc/provider_pool_probe_health_test.exs create mode 100644 test/lasso/rpc/selection_rate_limit_tiering_test.exs diff --git a/lib/lasso/core/health_probe/batch_coordinator.ex b/lib/lasso/core/health_probe/batch_coordinator.ex index 84b84e55..55f2c838 100644 --- a/lib/lasso/core/health_probe/batch_coordinator.ex +++ b/lib/lasso/core/health_probe/batch_coordinator.ex @@ -31,8 +31,8 @@ defmodule Lasso.HealthProbe.BatchCoordinator do require Logger alias Lasso.Core.Support.CircuitBreaker + alias Lasso.RPC.{Channel, ProviderPool, Response, TransportRegistry} alias Lasso.RPC.Transport.WebSocket.Connection, as: WSConnection - alias Lasso.RPC.{Channel, Response, TransportRegistry} @tick_interval_ms 200 @default_timeout_ms 5_000 @@ -500,6 +500,14 @@ defmodule Lasso.HealthProbe.BatchCoordinator do cb_id = {state.profile, state.chain, provider.provider_id, :http} CircuitBreaker.signal_recovery(cb_id) + ProviderPool.update_probe_health( + state.profile, + state.chain, + provider.provider_id, + :http, + :success + ) + new_consecutive_successes = provider.consecutive_successes + 1 was_failing = provider.consecutive_failures >= @max_consecutive_failures_before_warn @@ -535,8 +543,13 @@ defmodule Lasso.HealthProbe.BatchCoordinator do end defp 
handle_http_probe_failure(provider, state, reason, latency_ms, now) do - cb_id = {state.profile, state.chain, provider.provider_id, :http} - CircuitBreaker.record_failure(cb_id, reason) + ProviderPool.update_probe_health( + state.profile, + state.chain, + provider.provider_id, + :http, + {:failure, reason} + ) new_consecutive_failures = provider.consecutive_failures + 1 @@ -568,6 +581,14 @@ defmodule Lasso.HealthProbe.BatchCoordinator do CircuitBreaker.signal_recovery(cb_id) end + ProviderPool.update_probe_health( + state.profile, + state.chain, + provider.provider_id, + :ws, + :success + ) + new_consecutive_successes = provider.ws_consecutive_successes + 1 was_failing = provider.ws_consecutive_failures >= @max_consecutive_failures_before_warn @@ -603,8 +624,13 @@ defmodule Lasso.HealthProbe.BatchCoordinator do end defp handle_ws_probe_failure(provider, state, reason, latency_ms, now) do - cb_id = {state.profile, state.chain, provider.provider_id, :ws} - CircuitBreaker.record_failure(cb_id, reason) + ProviderPool.update_probe_health( + state.profile, + state.chain, + provider.provider_id, + :ws, + {:failure, reason} + ) new_consecutive_failures = provider.ws_consecutive_failures + 1 diff --git a/lib/lasso/core/providers/provider_pool.ex b/lib/lasso/core/providers/provider_pool.ex index 420082fe..ba0514b8 100644 --- a/lib/lasso/core/providers/provider_pool.ex +++ b/lib/lasso/core/providers/provider_pool.ex @@ -493,12 +493,40 @@ defmodule Lasso.RPC.ProviderPool do report_probe_results("default", chain_name, results) end + @deprecated "Use update_probe_health/5 instead" @spec report_probe_results(profile, chain_name, [map()]) :: :ok def report_probe_results(profile, chain_name, results) when is_binary(profile) and is_list(results) do GenServer.cast(via_name(profile, chain_name), {:probe_results, results}) end + @doc """ + Updates a provider's health status based on probe results. + + This is called by BatchCoordinator to report individual probe outcomes. 
+ Only updates ProviderPool health state (http_status/ws_status) — does NOT + interact with CircuitBreaker (avoids double-reporting). + + Probe success transitions :connecting → :healthy. For providers already in + a live-traffic-derived state, probe success only updates last_health_check. + Probe failure transitions :connecting → :degraded/:unhealthy based on + consecutive failures. + """ + @spec update_probe_health( + profile, + chain_name, + provider_id, + :http | :ws, + :success | {:failure, term()} + ) :: :ok + def update_probe_health(profile, chain_name, provider_id, transport, result) + when is_binary(profile) and transport in [:http, :ws] do + GenServer.cast( + via_name(profile, chain_name), + {:update_probe_health, provider_id, transport, result} + ) + end + @doc """ Reports a newHeads update from WebSocket subscription (future use). @@ -928,6 +956,55 @@ defmodule Lasso.RPC.ProviderPool do def handle_cast({:report_failure, provider_id, error, nil}, state), do: {:noreply, update_provider_failure(state, provider_id, error)} + @impl true + def handle_cast({:update_probe_health, provider_id, transport, result}, state) do + case Map.get(state.providers, provider_id) do + nil -> + {:noreply, state} + + provider -> + status_field = if transport == :http, do: :http_status, else: :ws_status + current_transport_status = Map.get(provider, status_field) + now = System.system_time(:millisecond) + + updated = + case result do + :success -> + if current_transport_status == :connecting do + provider + |> Map.put(status_field, :healthy) + |> Map.put(:consecutive_successes, provider.consecutive_successes + 1) + |> Map.put(:consecutive_failures, 0) + |> Map.put(:last_health_check, now) + |> then(&Map.put(&1, :status, derive_aggregate_status(&1))) + else + Map.put(provider, :last_health_check, now) + end + + {:failure, reason} -> + new_failures = provider.consecutive_failures + 1 + new_status = if new_failures >= @failure_threshold, do: :unhealthy, else: :degraded + + if 
current_transport_status == :connecting do + provider + |> Map.put(status_field, new_status) + |> Map.put(:consecutive_failures, new_failures) + |> Map.put(:consecutive_successes, 0) + |> Map.put(:last_error, reason) + |> Map.put(:last_health_check, now) + |> then(&Map.put(&1, :status, derive_aggregate_status(&1))) + else + provider + |> Map.put(:last_error, reason) + |> Map.put(:last_health_check, now) + end + end + + new_state = put_provider_and_refresh(state, provider_id, updated) + {:noreply, new_state} + end + end + # Async recovery time update from background Task (spawned in update_recovery_time_for_circuit) @impl true def handle_cast({:update_recovery_time_async, provider_id, transport, recovery_time}, state) do @@ -1752,10 +1829,6 @@ defmodule Lasso.RPC.ProviderPool do # Provider stays healthy - RateLimitState is the authoritative source for rate limit status retry_after_ms = RateLimitState.extract_retry_after(jerr.data) - # Report to circuit breaker (rate limit threshold is lower than other errors) - cb_id = {state.profile, state.chain_name, provider_id, transport} - CircuitBreaker.record_failure(cb_id, jerr) - # Only update last_error for debugging - don't change health status updated = provider diff --git a/lib/lasso/core/selection/selection.ex b/lib/lasso/core/selection/selection.ex index 17fbc2e2..59098c17 100644 --- a/lib/lasso/core/selection/selection.ex +++ b/lib/lasso/core/selection/selection.ex @@ -179,7 +179,6 @@ defmodule Lasso.RPC.Selection do ) # Build circuit state lookup map: {provider_id, transport} => :closed | :half_open - # Use defensive access in case circuit_state field is missing or nil circuit_state_map = provider_candidates |> Enum.flat_map(fn %{id: provider_id} = candidate -> @@ -192,6 +191,12 @@ defmodule Lasso.RPC.Selection do end) |> Map.new() + # Build rate limit lookup map: provider_id => %{http: bool, ws: bool} + rate_limit_map = + provider_candidates + |> Enum.map(fn %{id: id, rate_limited: rl} -> {id, rl} end) + |> 
Map.new() + # Build channel candidates via TransportRegistry (enforces channel-level health/capabilities) # Map provider list into channels, lazily opening as needed registry_start = System.monotonic_time(:microsecond) @@ -249,7 +254,17 @@ defmodule Lasso.RPC.Selection do tiered_channels = closed_channels ++ half_open_channels - tiered_channels |> Enum.take(limit) + # Rate-limit tiering: deprioritize rate-limited channels (tried last, not excluded). + # Final order: closed+not-rl, closed+rl, half-open+not-rl, half-open+rl + {not_rate_limited, rate_limited} = + Enum.split_with(tiered_channels, fn channel -> + rl = Map.get(rate_limit_map, channel.provider_id, %{http: false, ws: false}) + not Map.get(rl, channel.transport, false) + end) + + final_channels = not_rate_limited ++ rate_limited + + final_channels |> Enum.take(limit) end @doc """ diff --git a/lib/lasso/core/support/circuit_breaker.ex b/lib/lasso/core/support/circuit_breaker.ex index 13310524..ea45da91 100644 --- a/lib/lasso/core/support/circuit_breaker.ex +++ b/lib/lasso/core/support/circuit_breaker.ex @@ -348,11 +348,7 @@ defmodule Lasso.Core.Support.CircuitBreaker do end defp do_init(profile, chain, provider_id, transport, config) do - # Default category-specific thresholds - # Rate limits should open circuit quickly (2 failures) - # Server errors are more tolerant (5 failures) as they may be transient default_category_thresholds = %{ - rate_limit: 2, server_error: 5, network_error: 3, timeout: 2, diff --git a/lib/lasso/core/support/error_classification.ex b/lib/lasso/core/support/error_classification.ex index e43ad9ab..e6a759ac 100644 --- a/lib/lasso/core/support/error_classification.ex +++ b/lib/lasso/core/support/error_classification.ex @@ -40,8 +40,9 @@ defmodule Lasso.Core.Support.ErrorClassification do - `:unknown_error` - Unclassified error (fallback category) **Circuit breaker penalty**: - - All categories count against circuit breaker EXCEPT `:capability_violation` + - All categories count 
against circuit breaker EXCEPT `:capability_violation` and `:rate_limit` - Capability violations represent permanent constraints, not transient failures + - Rate limits are temporary backpressure handled by RateLimitState tiering, not circuit breakers """ # =========================================================================== @@ -264,6 +265,7 @@ defmodule Lasso.Core.Support.ErrorClassification do """ @spec breaker_penalty?(atom()) :: boolean() def breaker_penalty?(:capability_violation), do: false + def breaker_penalty?(:rate_limit), do: false def breaker_penalty?(_category), do: true @doc """ diff --git a/lib/lasso/jsonrpc/error.ex b/lib/lasso/jsonrpc/error.ex index fe871573..8ae9204d 100644 --- a/lib/lasso/jsonrpc/error.ex +++ b/lib/lasso/jsonrpc/error.ex @@ -98,10 +98,16 @@ defmodule Lasso.JSONRPC.Error do Keyword.get(opts, :category) || ErrorClassification.categorize(normalized_code, message) retriable? = - Keyword.get(opts, :retriable?) || ErrorClassification.retriable?(normalized_code, message) + case Keyword.get(opts, :retriable?) do + nil -> ErrorClassification.retriable?(normalized_code, message) + value -> value + end breaker_penalty? = - Keyword.get(opts, :breaker_penalty?) || ErrorClassification.breaker_penalty?(category) + case Keyword.get(opts, :breaker_penalty?) 
do + nil -> ErrorClassification.breaker_penalty?(category) + value -> value + end %__MODULE__{ code: normalized_code, diff --git a/lib/lasso_web/controllers/chain_controller.ex b/lib/lasso_web/controllers/chain_controller.ex index 0c54a0c2..c49aa7a6 100644 --- a/lib/lasso_web/controllers/chain_controller.ex +++ b/lib/lasso_web/controllers/chain_controller.ex @@ -19,9 +19,4 @@ defmodule LassoWeb.ChainController do json(conn, %{chains: chains}) end - - @spec status(Plug.Conn.t(), map()) :: Plug.Conn.t() - def status(conn, %{"chain_id" => _chain_id}) do - json(conn, %{status: "not implemented"}) - end end diff --git a/lib/lasso_web/dashboard/components/chain_details_panel.ex b/lib/lasso_web/dashboard/components/chain_details_panel.ex index 38e0f231..757c4ec6 100644 --- a/lib/lasso_web/dashboard/components/chain_details_panel.ex +++ b/lib/lasso_web/dashboard/components/chain_details_panel.ex @@ -46,7 +46,10 @@ defmodule LassoWeb.Dashboard.Components.ChainDetailsPanel do socket |> assign(assigns) |> assign(:chain_connections, chain_connections) - |> assign(:consensus_height, find_consensus_height(chain_connections)) + |> assign( + :consensus_height, + find_consensus_height(chain_connections, assigns[:cluster_block_heights] || %{}) + ) |> assign(:chain_events, chain_events) |> assign(:available_node_ids, available_node_ids) |> assign(:show_region_tabs, length(available_node_ids) > 1) @@ -88,8 +91,17 @@ defmodule LassoWeb.Dashboard.Components.ChainDetailsPanel do {:noreply, socket} end - defp find_consensus_height(connections) do - Enum.find_value(connections, fn conn -> Map.get(conn, :consensus_height) end) + defp find_consensus_height(chain_connections, cluster_block_heights) do + chain_provider_ids = MapSet.new(chain_connections, & &1.id) + + realtime_max = + cluster_block_heights + |> Enum.filter(fn {{pid, _node}, _} -> pid in chain_provider_ids end) + |> Enum.map(fn {_, %{height: h}} -> h end) + |> Enum.max(fn -> nil end) + + realtime_max || + 
Enum.find_value(chain_connections, fn conn -> Map.get(conn, :consensus_height) end) end @impl true @@ -217,21 +229,9 @@ defmodule LassoWeb.Dashboard.Components.ChainDetailsPanel do """ end - defp format_latency(nil), do: "—" - defp format_latency(ms), do: "#{ms}ms" - - defp format_rps(rps) when rps > 0, do: "#{rps}" - defp format_rps(_), do: "0" - - defp success_color(rate) when rate >= 95.0, do: "text-emerald-400" - defp success_color(rate) when rate >= 80.0, do: "text-yellow-400" - defp success_color(_), do: "text-red-400" - - @strategy_labels %{ - "round-robin" => "Load Balanced", - "latency-weighted" => "Latency Weighted", - "fastest" => "Fastest" - } + defdelegate format_latency(ms), to: Formatting + defdelegate format_rps(rps), to: Formatting + defp success_color(rate), do: Formatting.success_rate_color(rate) attr(:chain, :string, required: true) attr(:selected_profile, :string, required: true) @@ -309,7 +309,7 @@ defmodule LassoWeb.Dashboard.Components.ChainDetailsPanel do defp strategy_button(assigns) do assigns = assigns - |> assign(:label, Map.get(@strategy_labels, assigns.strategy, assigns.strategy)) + |> assign(:label, EndpointHelpers.strategy_display_name(assigns.strategy)) ~H""" """ @@ -324,7 +323,7 @@ defmodule LassoWeb.Dashboard.Components.MetricsTab do ) ]} > - {nid} + {Formatting.format_region_name(nid)} """ @@ -446,12 +445,10 @@ defmodule LassoWeb.Dashboard.Components.MetricsTab do """ end - defp format_latency(nil), do: "—" - defp format_latency(ms), do: "#{Formatting.safe_round(ms, 0)}ms" + defdelegate format_latency(ms), to: Formatting - defp success_color(rate) when rate >= 0.99, do: "text-emerald-400" - defp success_color(rate) when rate >= 0.95, do: "text-yellow-400" - defp success_color(_), do: "text-red-400" + defp success_color(nil), do: Formatting.success_rate_color(nil) + defp success_color(rate), do: Formatting.success_rate_color(rate * 100) attr(:method_metrics, :list, required: true) attr(:selected_node_id, :string, default: "all") 
diff --git a/lib/lasso_web/dashboard/components/provider_details_panel.ex b/lib/lasso_web/dashboard/components/provider_details_panel.ex index af018188..617e6ff0 100644 --- a/lib/lasso_web/dashboard/components/provider_details_panel.ex +++ b/lib/lasso_web/dashboard/components/provider_details_panel.ex @@ -8,9 +8,6 @@ defmodule LassoWeb.Dashboard.Components.ProviderDetailsPanel do use LassoWeb, :live_component - alias Lasso.BlockSync.Registry, as: BlockSyncRegistry - alias Lasso.Config.ConfigStore - alias Lasso.RPC.ChainState alias LassoWeb.Components.DetailPanelComponents alias LassoWeb.Components.RegionSelector alias LassoWeb.Dashboard.{Formatting, Helpers, StatusHelpers} @@ -84,13 +81,16 @@ defmodule LassoWeb.Dashboard.Components.ProviderDetailsPanel do regions_with_issues = find_regions_with_issues(cluster_circuits, provider_id) # Compute derived data (with cached fallback for metrics) + connections = socket.assigns[:connections] || [] + sync_data = compute_sync_data( selected_region, region_data, provider_connection, cluster_block_heights, - provider_id + provider_id, + connections ) metrics_data = compute_metrics_data(region_data, cached_fallback) @@ -144,70 +144,34 @@ defmodule LassoWeb.Dashboard.Components.ProviderDetailsPanel do region_data, provider_connection, cluster_block_heights, - provider_id + provider_id, + connections ) do conn = provider_connection || %{} chain = Map.get(conn, :chain) + chain_consensus = compute_chain_consensus(cluster_block_heights, chain, connections) - # Get block height data based on mode {block_height, block_lag, consensus_height} = + resolve_block_heights( + selected_region, + region_data, + conn, + cluster_block_heights, + provider_id, + chain_consensus + ) + + optimistic_lag = if selected_region == "aggregate" do - # Aggregate mode: find highest block height across all regions for this provider - provider_heights = - cluster_block_heights - |> Enum.filter(fn {{pid, _region}, _data} -> pid == provider_id end) - |> 
Enum.map(fn {{_pid, _region}, data} -> data end) - - if provider_heights != [] do - # Take highest block height and lowest lag (best case from any node) - max_height = - provider_heights - |> Enum.map(& &1[:height]) - |> Enum.reject(&is_nil/1) - |> Enum.max(fn -> nil end) - - min_lag = - provider_heights - |> Enum.map(& &1[:lag]) - |> Enum.reject(&is_nil/1) - |> Enum.min(fn -> 0 end) - - consensus = region_data[:consensus_height] || Map.get(conn, :consensus_height) - {max_height, min_lag, consensus} - else - # Fall back to local data - { - region_data[:block_height] || Map.get(conn, :block_height), - region_data[:block_lag] || 0, - region_data[:consensus_height] || Map.get(conn, :consensus_height) - } + case StatusHelpers.calculate_optimistic_lag(chain, provider_id) do + {:ok, lag} -> lag + {:error, _} -> nil end else - # Per-region mode: use region-specific data from cluster_block_heights - region_height_data = Map.get(cluster_block_heights, {provider_id, selected_region}, %{}) - - { - region_height_data[:height] || region_data[:block_height] || - Map.get(conn, :block_height), - region_height_data[:lag] || region_data[:block_lag] || 0, - region_data[:consensus_height] || Map.get(conn, :consensus_height) - } + nil end - # Calculate optimistic lag for better UX - {optimistic_lag, _raw_lag} = - if selected_region == "aggregate" do - calculate_optimistic_lag(chain, provider_id) - else - {nil, nil} - end - - effective_lag = - cond do - is_integer(block_lag) and block_lag > 0 -> block_lag - optimistic_lag != nil -> abs(min(0, optimistic_lag)) - true -> Map.get(conn, :blocks_behind, 0) || 0 - end + effective_lag = resolve_effective_lag(block_lag, optimistic_lag, conn) %{ block_height: block_height, @@ -219,6 +183,82 @@ defmodule LassoWeb.Dashboard.Components.ProviderDetailsPanel do } end + defp resolve_block_heights( + "aggregate", + region_data, + conn, + cluster_block_heights, + provider_id, + chain_consensus + ) do + provider_heights = + cluster_block_heights + |> 
Enum.filter(fn {{pid, _region}, _data} -> pid == provider_id end) + |> Enum.map(fn {{_pid, _region}, data} -> data end) + + consensus = + chain_consensus || region_data[:consensus_height] || Map.get(conn, :consensus_height) + + if provider_heights != [] do + max_height = + provider_heights + |> Enum.map(& &1[:height]) + |> Enum.reject(&is_nil/1) + |> Enum.max(fn -> nil end) + + min_lag = + provider_heights + |> Enum.map(& &1[:lag]) + |> Enum.reject(&is_nil/1) + |> Enum.min(fn -> 0 end) + + {max_height, min_lag, consensus} + else + {region_data[:block_height] || Map.get(conn, :block_height), region_data[:block_lag] || 0, + consensus} + end + end + + defp resolve_block_heights( + selected_region, + region_data, + conn, + cluster_block_heights, + provider_id, + chain_consensus + ) do + region_height_data = Map.get(cluster_block_heights, {provider_id, selected_region}, %{}) + + { + region_height_data[:height] || region_data[:block_height] || Map.get(conn, :block_height), + region_height_data[:lag] || region_data[:block_lag] || 0, + chain_consensus || region_data[:consensus_height] || Map.get(conn, :consensus_height) + } + end + + defp resolve_effective_lag(block_lag, optimistic_lag, conn) do + cond do + is_integer(block_lag) and block_lag > 0 -> block_lag + optimistic_lag != nil -> abs(min(0, optimistic_lag)) + true -> Map.get(conn, :blocks_behind, 0) || 0 + end + end + + defp compute_chain_consensus(cluster_block_heights, chain, connections) + when is_binary(chain) do + chain_provider_ids = + connections + |> Enum.filter(&(&1.chain == chain)) + |> MapSet.new(& &1.id) + + cluster_block_heights + |> Enum.filter(fn {{pid, _node}, _} -> pid in chain_provider_ids end) + |> Enum.map(fn {_, %{height: h}} -> h end) + |> Enum.max(fn -> nil end) + end + + defp compute_chain_consensus(_, _, _), do: nil + defp compute_metrics_data(region_data, cached_metrics) do # Prefer live data for real-time metrics, fall back to cached # Note: region_data comes from aggregator (real-time 
events) @@ -530,7 +570,12 @@ defmodule LassoWeb.Dashboard.Components.ProviderDetailsPanel do ~H""" - +
+

Performance

+ + Profile + +
<:metric label="Latency p50" value={@p50} /> <:metric label="Latency p95" value={@p95} /> @@ -1005,50 +1050,6 @@ defmodule LassoWeb.Dashboard.Components.ProviderDetailsPanel do # --- Helper Functions --- - defp calculate_optimistic_lag(chain, provider_id) - when is_binary(chain) and is_binary(provider_id) do - with {:ok, {height, timestamp, _source, _meta}} <- - BlockSyncRegistry.get_height(chain, provider_id), - {:ok, consensus} <- ChainState.consensus_height(chain) do - block_time_ms = get_block_time_ms(chain) - now = System.system_time(:millisecond) - elapsed_ms = now - timestamp - raw_lag = height - consensus - - staleness_credit = - if block_time_ms > 0 do - div(elapsed_ms, block_time_ms) - else - 0 - end - - max_credit = div(30_000, max(block_time_ms, 1)) - capped_credit = min(staleness_credit, max_credit) - - optimistic_height = height + capped_credit - optimistic_lag = optimistic_height - consensus - - {optimistic_lag, raw_lag} - else - _ -> {nil, nil} - end - end - - defp calculate_optimistic_lag(_, _), do: {nil, nil} - - defp get_block_time_ms(chain) do - case ConfigStore.list_profiles_for_chain(chain) do - [profile | _] -> - case ConfigStore.get_chain(profile, chain) do - {:ok, config} -> config.block_time_ms || 12_000 - _ -> 12_000 - end - - [] -> - 12_000 - end - end - defp sync_status_level(blocks_behind) when blocks_behind <= 2, do: :healthy defp sync_status_level(blocks_behind) when blocks_behind <= 10, do: :degraded defp sync_status_level(_), do: :down @@ -1078,32 +1079,17 @@ defmodule LassoWeb.Dashboard.Components.ProviderDetailsPanel do defp sync_progress(block_height, consensus_height), do: min(100, block_height / consensus_height * 100) - defp format_latency(nil), do: "—" - defp format_latency(ms), do: "#{round(ms)}ms" + defdelegate format_latency(ms), to: Formatting + defdelegate format_time_ago(ts_ms), to: Formatting + defdelegate success_rate_color(rate), to: Formatting defp format_traffic(nil), do: "—" defp format_traffic(value), do: 
"#{value |> Helpers.to_float() |> Float.round(1)}%" - defp success_rate_color(rate) when rate >= 99.0, do: "text-emerald-400" - defp success_rate_color(rate) when rate >= 95.0, do: "text-yellow-400" - defp success_rate_color(_), do: "text-red-400" - defp severity_dot_color(:error), do: "bg-red-500" defp severity_dot_color(:warn), do: "bg-yellow-500" defp severity_dot_color(_), do: "bg-blue-500" - defp format_time_ago(nil), do: "—" - - defp format_time_ago(ts_ms) do - diff_ms = System.system_time(:millisecond) - ts_ms - - cond do - diff_ms < 60_000 -> "now" - diff_ms < 3_600_000 -> "#{div(diff_ms, 60_000)}m ago" - true -> "#{div(diff_ms, 3_600_000)}h ago" - end - end - defp truncate_message(message, max_length) when byte_size(message) > max_length do String.slice(message, 0, max_length - 3) <> "..." end diff --git a/lib/lasso_web/dashboard/endpoint_helpers.ex b/lib/lasso_web/dashboard/endpoint_helpers.ex index 1408902c..c88d939d 100644 --- a/lib/lasso_web/dashboard/endpoint_helpers.ex +++ b/lib/lasso_web/dashboard/endpoint_helpers.ex @@ -112,11 +112,11 @@ defmodule LassoWeb.Dashboard.EndpointHelpers do defp extract_chain_name(%{chain: chain}) when is_binary(chain), do: chain defp extract_chain_name(_), do: "ethereum" - # Strategy display names - defp strategy_display_name("round-robin"), do: "Load Balanced" - defp strategy_display_name("latency-weighted"), do: "Latency Weighted" - defp strategy_display_name("fastest"), do: "Fastest" - defp strategy_display_name(other), do: other |> String.replace("-", " ") |> String.capitalize() + @doc "Get display name for a strategy" + def strategy_display_name("round-robin"), do: "Load Balanced" + def strategy_display_name("latency-weighted"), do: "Latency Weighted" + def strategy_display_name("fastest"), do: "Fastest" + def strategy_display_name(other), do: other |> String.replace("-", " ") |> String.capitalize() # Strategy icons defp strategy_icon("fastest"), do: "⚡" @@ -124,18 +124,18 @@ defmodule 
LassoWeb.Dashboard.EndpointHelpers do defp strategy_icon("latency-weighted"), do: "⚖️" defp strategy_icon(_), do: "🎯" - # Strategy descriptions - defp strategy_description("round-robin") do + @doc "Get description for a strategy" + def strategy_description("round-robin") do "Distributes requests evenly across all available providers — good for general purpose workloads" end - defp strategy_description("latency-weighted") do + def strategy_description("latency-weighted") do "Load balanced favoring faster providers — good for high-throughput workloads like indexing and backfilling" end - defp strategy_description("fastest") do + def strategy_description("fastest") do "Routes all requests to the single fastest provider — best suited for low-volume, latency-sensitive calls" end - defp strategy_description(_), do: "Strategy-based routing" + def strategy_description(_), do: "Strategy-based routing" end diff --git a/lib/lasso_web/dashboard/formatting.ex b/lib/lasso_web/dashboard/formatting.ex index dd546a2a..24d5df41 100644 --- a/lib/lasso_web/dashboard/formatting.ex +++ b/lib/lasso_web/dashboard/formatting.ex @@ -89,4 +89,57 @@ defmodule LassoWeb.Dashboard.Formatting do def safe_round(value, _precision) when is_integer(value), do: value def safe_round(value, precision) when is_float(value), do: Float.round(value, precision) def safe_round(nil, _precision), do: nil + + @doc """ + Formats a raw node/region ID into a human-readable display name. + + Extracts the lowercase prefix before the first dash and looks it up + in the `:region_display_names` config. Returns the original string + if no mapping is found. 
+ + ## Examples + + iex> Formatting.format_region_name("Sjc-080713ea67e778") + "San Jose" + + iex> Formatting.format_region_name("unknown-region") + "unknown-region" + """ + def format_region_name(region) when is_binary(region) do + prefix = + region + |> String.downcase() + |> String.split("-", parts: 2) + |> List.first() + + display_names = Application.get_env(:lasso, :region_display_names, %{}) + Map.get(display_names, prefix, region) + end + + def format_region_name(region), do: region + + def format_latency(nil), do: "—" + def format_latency(ms) when is_float(ms), do: "#{round(ms)}ms" + def format_latency(ms), do: "#{ms}ms" + + def format_rps(rps) when rps == 0 or rps == 0.0, do: "0" + def format_rps(rps), do: "#{rps}" + + def format_time_ago(nil), do: "—" + + def format_time_ago(ts_ms) do + diff_ms = System.system_time(:millisecond) - ts_ms + + cond do + diff_ms < 60_000 -> "now" + diff_ms < 3_600_000 -> "#{div(diff_ms, 60_000)}m ago" + true -> "#{div(diff_ms, 3_600_000)}h ago" + end + end + + @doc "Tailwind color class for success rate percentage (0-100 scale)." 
+ def success_rate_color(nil), do: "text-gray-500" + def success_rate_color(rate) when rate >= 99.0, do: "text-emerald-400" + def success_rate_color(rate) when rate >= 95.0, do: "text-yellow-400" + def success_rate_color(_), do: "text-red-400" end diff --git a/lib/lasso_web/dashboard/helpers.ex b/lib/lasso_web/dashboard/helpers.ex index fb3e799f..1b6eaad0 100644 --- a/lib/lasso_web/dashboard/helpers.ex +++ b/lib/lasso_web/dashboard/helpers.ex @@ -138,24 +138,4 @@ defmodule LassoWeb.Dashboard.Helpers do meta: Keyword.get(opts, :meta, %{}) } end - - @doc "Get strategy description" - def get_strategy_description(strategy) do - case strategy do - "fastest" -> - "Routes all requests to the single fastest provider — best suited for low-volume, latency-sensitive calls" - - "priority" -> - "Routes by configured provider priority order" - - "round-robin" -> - "Distributes requests evenly across all available providers — good for general purpose workloads" - - "latency-weighted" -> - "Load balanced favoring faster providers — good for high-throughput workloads like indexing and backfilling" - - _ -> - "Strategy-based routing" - end - end end diff --git a/lib/lasso_web/dashboard/status_helpers.ex b/lib/lasso_web/dashboard/status_helpers.ex index c879fd33..2f32bfdd 100644 --- a/lib/lasso_web/dashboard/status_helpers.ex +++ b/lib/lasso_web/dashboard/status_helpers.ex @@ -149,14 +149,16 @@ defmodule LassoWeb.Dashboard.StatusHelpers do def check_block_lag(_chain, _provider_id), do: :unavailable - # Calculate optimistic lag that accounts for observation delay. - # - # With HTTP polling always running (see BlockSync.Worker), the registry - # always has reasonably fresh data. This formula credits providers for - # blocks that likely arrived since the last observation. - # - # The 30s cap prevents runaway values in edge cases. - defp calculate_optimistic_lag(chain, provider_id) do + @doc """ + Calculate optimistic lag that accounts for observation delay. 
+ + With HTTP polling always running (see BlockSync.Worker), the registry + always has reasonably fresh data. This formula credits providers for + blocks that likely arrived since the last observation. + + The 30s cap prevents runaway values in edge cases. + """ + def calculate_optimistic_lag(chain, provider_id) do with {:ok, {height, timestamp, _source, _meta}} <- BlockSyncRegistry.get_height(chain, provider_id), {:ok, consensus} <- ChainState.consensus_height(chain) do @@ -184,8 +186,10 @@ defmodule LassoWeb.Dashboard.StatusHelpers do end end - # Get block_time_ms for a chain. Prefers dynamic measurement, falls back to config. - defp get_block_time_ms(chain) do + @doc """ + Get block_time_ms for a chain. Prefers dynamic measurement, falls back to config. + """ + def get_block_time_ms(chain) do case BlockSyncRegistry.get_block_time_ms(chain) do ms when is_integer(ms) and ms > 0 -> ms diff --git a/lib/lasso_web/router.ex b/lib/lasso_web/router.ex index 62fad6a2..8edf0550 100644 --- a/lib/lasso_web/router.ex +++ b/lib/lasso_web/router.ex @@ -39,7 +39,6 @@ defmodule LassoWeb.Router do # Chain endpoints get("/chains", ChainController, :index) - get("/chains/:chain_id/status", ChainController, :status) end # HTTP JSON-RPC endpoints diff --git a/test/integration/health_probe_integration_test.exs b/test/integration/health_probe_integration_test.exs index 047b9c60..51a6a3d7 100644 --- a/test/integration/health_probe_integration_test.exs +++ b/test/integration/health_probe_integration_test.exs @@ -506,7 +506,7 @@ defmodule Lasso.Integration.HealthProbeIntegrationTest do describe "rate limit handling" do @tag :integration - test "rate limit opens circuit with lower threshold", %{chain: chain} do + test "rate limit does not open circuit (handled by RateLimitState tiering)", %{chain: chain} do profile = "default" provider_spec = %{ @@ -526,23 +526,17 @@ defmodule Lasso.Integration.HealthProbeIntegrationTest do cb_id = {profile, chain, provider_id, :http} 
wait_for_circuit_breaker(cb_id) - # Rate limit errors should open circuit faster (threshold: 2) rate_limit_error = {:rate_limit, %{retry_after: 60}} - # First rate limit CircuitBreaker.call(cb_id, fn -> {:error, rate_limit_error} end) Process.sleep(20) - # May still be closed after 1 (depends on threshold) - _state = CircuitBreaker.get_state(cb_id) - - # Second rate limit CircuitBreaker.call(cb_id, fn -> {:error, rate_limit_error} end) Process.sleep(50) - # Should be open now (rate limit threshold is 2) + # Circuit should remain closed — rate limits don't trip circuit breakers state = CircuitBreaker.get_state(cb_id) - assert state.state == :open + assert state.state == :closed end end diff --git a/test/integration/request_pipeline_integration_test.exs b/test/integration/request_pipeline_integration_test.exs index 209b641d..3a761093 100644 --- a/test/integration/request_pipeline_integration_test.exs +++ b/test/integration/request_pipeline_integration_test.exs @@ -631,7 +631,6 @@ defmodule Lasso.RPC.RequestPipelineIntegrationTest do test "rate-limited provider triggers automatic failover to healthy provider", %{chain: chain} do profile = "default" - # Setup: Primary provider that rate limits, backup that's healthy rate_limit_error = %Lasso.JSONRPC.Error{ code: 429, @@ -650,11 +649,10 @@ defmodule Lasso.RPC.RequestPipelineIntegrationTest do %{id: "healthy_backup", priority: 20, behavior: :healthy, profile: profile} ]) - # Ensure circuit breakers exist CircuitBreakerHelper.ensure_circuit_breaker_started(profile, chain, "rate_limited", :http) CircuitBreakerHelper.ensure_circuit_breaker_started(profile, chain, "healthy_backup", :http) - # CRITICAL TEST: Request should automatically failover to healthy backup + # Request should failover to healthy backup via normal retry logic {:ok, result, _ctx} = RequestPipeline.execute_via_channels( chain, @@ -663,13 +661,11 @@ defmodule Lasso.RPC.RequestPipelineIntegrationTest do %RequestOptions{strategy: :priority, timeout_ms: 
30_000} ) - # Verify we got a successful response (failover worked) assert %Response.Success{} = result {:ok, block_number} = Response.Success.decode_result(result) assert String.starts_with?(block_number, "0x") - # To trigger circuit breaker opening, we need to directly target the rate-limited - # provider without failover, so the circuit breaker sees the failures + # Send rate limit errors directly to the rate-limited provider for _ <- 1..2 do {:error, _, _} = RequestPipeline.execute_via_channels( @@ -686,24 +682,13 @@ defmodule Lasso.RPC.RequestPipelineIntegrationTest do Process.sleep(100) end - # Wait for circuit breaker to process the failures and open Process.sleep(500) - # Wait for circuit breaker to open (rate limit threshold is 2) + # Rate limits are handled by RateLimitState tiering, not circuit breakers breaker_id = {profile, chain, "rate_limited", :http} + CircuitBreakerHelper.assert_circuit_breaker_state(breaker_id, :closed) - {:ok, _state} = - CircuitBreakerHelper.wait_for_circuit_breaker_state( - breaker_id, - fn state -> state.state == :open end, - timeout: 10_000, - interval: 100 - ) - - # Verify rate-limited provider's circuit is now open - CircuitBreakerHelper.assert_circuit_breaker_state(breaker_id, :open) - - # Now verify that subsequent requests with priority strategy use backup + # Subsequent requests still succeed via failover {:ok, result2, _ctx2} = RequestPipeline.execute_via_channels( chain, @@ -712,14 +697,14 @@ defmodule Lasso.RPC.RequestPipelineIntegrationTest do %RequestOptions{strategy: :priority, timeout_ms: 30_000} ) - # Should still succeed using backup since primary's circuit is open assert %Response.Success{} = result2 end - test "rate limit error opens circuit breaker faster than normal errors", %{chain: chain} do + test "rate limit errors do not open circuit breaker (handled by RateLimitState tiering)", %{ + chain: chain + } do profile = "default" - # Setup provider that rate limits rate_limit_error = %Lasso.JSONRPC.Error{ 
code: 429, @@ -744,7 +729,6 @@ defmodule Lasso.RPC.RequestPipelineIntegrationTest do :http ) - # Execute 2 requests (rate limit threshold is 2, vs 5 for normal errors) for _ <- 1..2 do {:error, _error, _ctx} = RequestPipeline.execute_via_channels( @@ -761,20 +745,20 @@ defmodule Lasso.RPC.RequestPipelineIntegrationTest do Process.sleep(50) end - # Give circuit breaker time to open Process.sleep(500) - # Verify circuit opened after only 2 rate limit errors (not 5) + # Circuit should remain closed — rate limits don't trip circuit breakers CircuitBreakerHelper.assert_circuit_breaker_state( {profile, chain, "rate_limited_fast", :http}, - :open + :closed ) end - test "multiple providers can be rate-limited independently", %{chain: chain} do + test "multiple providers can be rate-limited independently without opening circuits", %{ + chain: chain + } do profile = "default" - # Setup: Multiple providers, all rate-limited rate_limit_error = %Lasso.JSONRPC.Error{ code: 429, @@ -789,7 +773,6 @@ defmodule Lasso.RPC.RequestPipelineIntegrationTest do %{id: "provider_c", priority: 30, behavior: :healthy, profile: profile} ]) - # Ensure circuit breakers exist for provider_id <- ["provider_a", "provider_b", "provider_c"] do CircuitBreakerHelper.ensure_circuit_breaker_started(profile, chain, provider_id, :http) end @@ -830,15 +813,15 @@ defmodule Lasso.RPC.RequestPipelineIntegrationTest do Process.sleep(300) - # Verify both a and b are open, but c is still closed + # All circuits remain closed — rate limits are handled by RateLimitState tiering CircuitBreakerHelper.assert_circuit_breaker_state( {profile, chain, "provider_a", :http}, - :open + :closed ) CircuitBreakerHelper.assert_circuit_breaker_state( {profile, chain, "provider_b", :http}, - :open + :closed ) CircuitBreakerHelper.assert_circuit_breaker_state( @@ -846,7 +829,7 @@ defmodule Lasso.RPC.RequestPipelineIntegrationTest do :closed ) - # Request with priority strategy should use provider_c (only healthy one) + # Request 
with priority strategy should still succeed via failover {:ok, result, _ctx} = RequestPipeline.execute_via_channels( chain, diff --git a/test/lasso/core/support/error_classification_test.exs b/test/lasso/core/support/error_classification_test.exs index 2d354f63..4b894f95 100644 --- a/test/lasso/core/support/error_classification_test.exs +++ b/test/lasso/core/support/error_classification_test.exs @@ -123,9 +123,9 @@ defmodule Lasso.RPC.ErrorClassificationTest do assert penalty == false end - test "rate limits do penalize circuit breaker" do + test "rate limits do not penalize circuit breaker" do penalty = ErrorClassification.breaker_penalty?(:rate_limit) - assert penalty == true + assert penalty == false end test "server errors do penalize circuit breaker" do diff --git a/test/lasso/jsonrpc/error_test.exs b/test/lasso/jsonrpc/error_test.exs new file mode 100644 index 00000000..cdc8ab0b --- /dev/null +++ b/test/lasso/jsonrpc/error_test.exs @@ -0,0 +1,69 @@ +defmodule Lasso.JSONRPC.ErrorTest do + use ExUnit.Case, async: true + + alias Lasso.JSONRPC.Error, as: JError + + describe "new/3 nil-aware defaults" do + test "explicit breaker_penalty?: false is preserved (not overridden by ||)" do + jerr = JError.new(-32_005, "Rate limited", breaker_penalty?: false) + assert jerr.breaker_penalty? == false + end + + test "explicit retriable?: false is preserved" do + jerr = JError.new(-32_005, "Rate limited", retriable?: false) + assert jerr.retriable? == false + end + + test "explicit breaker_penalty?: true is preserved" do + jerr = JError.new(-32_602, "Invalid params", breaker_penalty?: true) + assert jerr.breaker_penalty? == true + end + + test "explicit retriable?: true is preserved" do + jerr = JError.new(-32_602, "Invalid params", retriable?: true) + assert jerr.retriable? == true + end + + test "omitted breaker_penalty? falls through to ErrorClassification" do + # :rate_limit category → breaker_penalty? 
false (after our fix) + jerr = JError.new(-32_005, "Rate limited") + assert jerr.category == :rate_limit + assert jerr.breaker_penalty? == false + end + + test "omitted retriable? falls through to ErrorClassification" do + # -32_602 is :invalid_params → retriable? false + jerr = JError.new(-32_602, "Invalid params") + assert jerr.retriable? == false + end + + test "rate limit error has correct classification" do + jerr = JError.new(-32_005, "Rate limited") + assert jerr.category == :rate_limit + assert jerr.retriable? == true + assert jerr.breaker_penalty? == false + end + + test "capability violation has correct classification" do + jerr = JError.new(-32_000, "block range too large") + assert jerr.category == :capability_violation + assert jerr.retriable? == true + assert jerr.breaker_penalty? == false + end + + test "server error has correct classification" do + jerr = JError.new(-32_000, "Internal server error") + assert jerr.category == :server_error + assert jerr.retriable? == true + assert jerr.breaker_penalty? == true + end + + test "HTTP 429 is normalized to -32_005 rate limit" do + jerr = JError.new(429, "Too Many Requests") + assert jerr.code == -32_005 + assert jerr.original_code == 429 + assert jerr.category == :rate_limit + assert jerr.breaker_penalty? 
== false + end + end +end diff --git a/test/lasso/rpc/circuit_breaker_test.exs b/test/lasso/rpc/circuit_breaker_test.exs index d8703e7a..368da19e 100644 --- a/test/lasso/rpc/circuit_breaker_test.exs +++ b/test/lasso/rpc/circuit_breaker_test.exs @@ -122,7 +122,7 @@ defmodule Lasso.RPC.CircuitBreakerTest do assert CircuitBreaker.get_state(id).state == :closed end - test "rate limit errors open circuit after 2 failures (lower threshold)" do + test "rate limit errors do not open circuit (handled by RateLimitState tiering)" do alias Lasso.JSONRPC.Error, as: JError id = {"default", "test_chain", "cb_rate_limit", :http} @@ -131,7 +131,6 @@ defmodule Lasso.RPC.CircuitBreakerTest do {id, %{failure_threshold: 5, recovery_timeout: 100, success_threshold: 2}} ) - # Rate limit error should use threshold of 2, not 5 rate_limit_error = JError.new(-32_005, "Rate limited", category: :rate_limit) assert {:executed, {:error, _}} = @@ -142,7 +141,7 @@ defmodule Lasso.RPC.CircuitBreakerTest do Process.sleep(20) state = CircuitBreaker.get_state(id) - assert state.state == :open, "Circuit should open after 2 rate limit errors" + assert state.state == :closed, "Circuit should remain closed for rate limit errors" end test "server errors use default threshold of 5" do @@ -165,7 +164,7 @@ defmodule Lasso.RPC.CircuitBreakerTest do assert state.state == :closed, "Circuit should remain closed after 2 server errors" end - test "retry-after header adjusts recovery timeout for rate limits" do + test "rate limit errors with retry-after do not open circuit" do alias Lasso.JSONRPC.Error, as: JError id = {"default", "test_chain", "cb_retry_after", :http} @@ -174,8 +173,6 @@ defmodule Lasso.RPC.CircuitBreakerTest do {id, %{failure_threshold: 2, recovery_timeout: 60_000, success_threshold: 2}} ) - # Rate limit error with retry-after in data (populated by ErrorNormalizer) - # :retry_after_ms is in milliseconds rate_limit_error = JError.new(-32_005, "Rate limited", category: :rate_limit, @@ -190,22 
+187,14 @@ defmodule Lasso.RPC.CircuitBreakerTest do Process.sleep(20) state = CircuitBreaker.get_state(id) - assert state.state == :open - - # Should use 2 second timeout instead of default 60 seconds - # Wait 2.1 seconds and verify circuit attempts recovery - Process.sleep(2100) - result = CircuitBreaker.call(id, fn -> {:ok, :success} end) - - assert match?({:executed, {:ok, :success}}, result), - "Circuit should attempt recovery after 2 seconds" + assert state.state == :closed, "Rate limits should not open circuit" end test "custom category thresholds can be configured" do alias Lasso.JSONRPC.Error, as: JError id = {"default", "test_chain", "cb_custom_threshold", :http} - # Override rate_limit threshold to 3 instead of default 2 + # Override auth_error threshold to 3 instead of default 2 {:ok, _pid} = CircuitBreaker.start_link( {id, @@ -213,18 +202,23 @@ defmodule Lasso.RPC.CircuitBreakerTest do failure_threshold: 5, recovery_timeout: 100, success_threshold: 2, - category_thresholds: %{rate_limit: 3} + category_thresholds: %{auth_error: 3} }} ) - rate_limit_error = JError.new(-32_005, "Rate limited", category: :rate_limit) + auth_error = + JError.new(-32_000, "Unauthorized", + category: :auth_error, + retriable?: true, + breaker_penalty?: true + ) # Should not open after 2 failures (needs 3 now) assert {:executed, {:error, _}} = - CircuitBreaker.call(id, fn -> {:error, rate_limit_error} end) + CircuitBreaker.call(id, fn -> {:error, auth_error} end) assert {:executed, {:error, _}} = - CircuitBreaker.call(id, fn -> {:error, rate_limit_error} end) + CircuitBreaker.call(id, fn -> {:error, auth_error} end) Process.sleep(20) state = CircuitBreaker.get_state(id) @@ -232,7 +226,7 @@ defmodule Lasso.RPC.CircuitBreakerTest do # Should open after 3rd failure assert {:executed, {:error, _}} = - CircuitBreaker.call(id, fn -> {:error, rate_limit_error} end) + CircuitBreaker.call(id, fn -> {:error, auth_error} end) Process.sleep(20) state = CircuitBreaker.get_state(id) diff 
--git a/test/lasso/rpc/provider_pool_probe_health_test.exs b/test/lasso/rpc/provider_pool_probe_health_test.exs new file mode 100644 index 00000000..65db8023 --- /dev/null +++ b/test/lasso/rpc/provider_pool_probe_health_test.exs @@ -0,0 +1,179 @@ +defmodule Lasso.RPC.ProviderPoolProbeHealthTest do + use ExUnit.Case, async: false + import Mox + + alias Lasso.RPC.ProviderPool + alias Lasso.Config.ChainConfig + + setup_all do + TestHelper.ensure_test_environment_ready() + :ok + end + + setup :verify_on_exit! + + setup do + stub(Lasso.RPC.HttpClientMock, :request, fn _endpoint, method, _params, _timeout -> + case method do + "eth_chainId" -> {:ok, "0x1"} + _ -> {:error, :method_not_mocked} + end + end) + + :ok + end + + defp provider_struct(attrs) do + struct( + ChainConfig.Provider, + Map.merge( + %{ + id: "test_provider", + name: "Test Provider", + priority: 1, + url: "http://example", + ws_url: nil + }, + attrs + ) + ) + end + + defp base_chain_config(providers) do + %{ + aggregation: %{max_providers: 5}, + global: %{ + health_check: %{ + interval: 2000, + timeout: 5_000, + failure_threshold: 2, + recovery_threshold: 1 + }, + provider_management: %{load_balancing: "priority"} + }, + providers: providers + } + end + + describe "update_probe_health/5" do + test "probe success transitions :connecting to :healthy" do + p = provider_struct(%{id: "probe_success_1", name: "P1"}) + chain_config = base_chain_config([p]) + chain = "probe_health_test_1" + + {:ok, _pid} = ProviderPool.start_link({"default", chain, chain_config}) + :ok = ProviderPool.register_provider("default", chain, p.id, p) + + # Verify initial state is :connecting + {:ok, status} = ProviderPool.get_status("default", chain) + provider = Enum.find(status.providers, &(&1.id == p.id)) + assert provider.http_status == :connecting + + # Send probe success + ProviderPool.update_probe_health("default", chain, p.id, :http, :success) + Process.sleep(50) + + # Should transition to :healthy + {:ok, status} = 
ProviderPool.get_status("default", chain) + provider = Enum.find(status.providers, &(&1.id == p.id)) + assert provider.http_status == :healthy + end + + test "probe failure transitions :connecting to :degraded" do + p = provider_struct(%{id: "probe_fail_1", name: "P1"}) + chain_config = base_chain_config([p]) + chain = "probe_health_test_2" + + {:ok, _pid} = ProviderPool.start_link({"default", chain, chain_config}) + :ok = ProviderPool.register_provider("default", chain, p.id, p) + + # Send probe failure + ProviderPool.update_probe_health("default", chain, p.id, :http, {:failure, :timeout}) + Process.sleep(50) + + {:ok, status} = ProviderPool.get_status("default", chain) + provider = Enum.find(status.providers, &(&1.id == p.id)) + assert provider.http_status in [:degraded, :unhealthy] + end + + test "probe success does not override live-traffic :healthy status" do + p = provider_struct(%{id: "probe_noop_1", name: "P1"}) + chain_config = base_chain_config([p]) + chain = "probe_health_test_3" + + {:ok, _pid} = ProviderPool.start_link({"default", chain, chain_config}) + :ok = ProviderPool.register_provider("default", chain, p.id, p) + + # Transition to healthy via live traffic success + ProviderPool.report_success("default", chain, p.id, :http) + Process.sleep(50) + + {:ok, status} = ProviderPool.get_status("default", chain) + provider = Enum.find(status.providers, &(&1.id == p.id)) + assert provider.http_status == :healthy + old_check = provider.last_health_check + + # Probe success should only update last_health_check, not change status + Process.sleep(10) + ProviderPool.update_probe_health("default", chain, p.id, :http, :success) + Process.sleep(50) + + {:ok, status} = ProviderPool.get_status("default", chain) + provider = Enum.find(status.providers, &(&1.id == p.id)) + assert provider.http_status == :healthy + assert provider.last_health_check >= old_check + end + + test "probe failure does not override live-traffic :degraded status" do + p = provider_struct(%{id: 
"probe_noop_2", name: "P1"}) + chain_config = base_chain_config([p]) + chain = "probe_health_test_4" + + {:ok, _pid} = ProviderPool.start_link({"default", chain, chain_config}) + :ok = ProviderPool.register_provider("default", chain, p.id, p) + + # Transition to degraded via live traffic failure + ProviderPool.report_failure("default", chain, p.id, {:network_error, "timeout"}, :http) + Process.sleep(50) + + {:ok, status} = ProviderPool.get_status("default", chain) + provider = Enum.find(status.providers, &(&1.id == p.id)) + assert provider.http_status in [:degraded, :unhealthy] + old_status = provider.http_status + + # Probe failure should not change existing live-traffic-derived status + ProviderPool.update_probe_health("default", chain, p.id, :http, {:failure, :timeout}) + Process.sleep(50) + + {:ok, status} = ProviderPool.get_status("default", chain) + provider = Enum.find(status.providers, &(&1.id == p.id)) + assert provider.http_status == old_status + end + + test "probe failure from :connecting transitions to :degraded (probes resolve stuck state only)" do + p = provider_struct(%{id: "probe_multi_fail", name: "P1"}) + chain_config = base_chain_config([p]) + chain = "probe_health_test_5" + + {:ok, _pid} = ProviderPool.start_link({"default", chain, chain_config}) + :ok = ProviderPool.register_provider("default", chain, p.id, p) + + # First failure transitions :connecting → :degraded + ProviderPool.update_probe_health("default", chain, p.id, :http, {:failure, :timeout}) + Process.sleep(50) + + {:ok, status} = ProviderPool.get_status("default", chain) + provider = Enum.find(status.providers, &(&1.id == p.id)) + assert provider.http_status == :degraded + + # Subsequent probe failures don't further degrade — probes only resolve :connecting, + # live traffic drives full health lifecycle + ProviderPool.update_probe_health("default", chain, p.id, :http, {:failure, :timeout}) + Process.sleep(50) + + {:ok, status} = ProviderPool.get_status("default", chain) + provider = 
Enum.find(status.providers, &(&1.id == p.id)) + assert provider.http_status == :degraded + end + end +end diff --git a/test/lasso/rpc/selection_rate_limit_tiering_test.exs b/test/lasso/rpc/selection_rate_limit_tiering_test.exs new file mode 100644 index 00000000..cfb19976 --- /dev/null +++ b/test/lasso/rpc/selection_rate_limit_tiering_test.exs @@ -0,0 +1,140 @@ +defmodule Lasso.RPC.SelectionRateLimitTieringTest do + @moduledoc """ + Tests that rate-limited providers are deprioritized (placed last) in selection, + not excluded entirely. + """ + + use ExUnit.Case, async: false + import Mox + + alias Lasso.JSONRPC.Error, as: JError + alias Lasso.RPC.ProviderPool + alias Lasso.Config.ChainConfig + + setup_all do + TestHelper.ensure_test_environment_ready() + :ok + end + + setup :verify_on_exit! + + setup do + stub(Lasso.RPC.HttpClientMock, :request, fn _endpoint, method, _params, _timeout -> + case method do + "eth_chainId" -> {:ok, "0x1"} + "eth_blockNumber" -> {:ok, "0x100"} + _ -> {:error, :method_not_mocked} + end + end) + + :ok + end + + defp provider_struct(attrs) do + struct( + ChainConfig.Provider, + Map.merge( + %{ + id: "test_provider", + name: "Test Provider", + priority: 1, + url: "http://example" + }, + attrs + ) + ) + end + + defp base_chain_config(providers) do + %{ + aggregation: %{max_providers: 5}, + global: %{ + health_check: %{ + interval: 2000, + timeout: 5_000, + failure_threshold: 2, + recovery_threshold: 1 + }, + provider_management: %{load_balancing: "priority"} + }, + providers: providers + } + end + + describe "rate limit tiering in list_candidates" do + test "rate-limited providers appear in candidates with rate_limited flag" do + p1 = provider_struct(%{id: "rl_tier_p1", name: "P1", priority: 1}) + p2 = provider_struct(%{id: "rl_tier_p2", name: "P2", priority: 2}) + chain = "rl_tiering_test_1" + chain_config = base_chain_config([p1, p2]) + + {:ok, _pid} = ProviderPool.start_link({"default", chain, chain_config}) + + # Make providers healthy 
+ ProviderPool.report_success("default", chain, p1.id, :http) + ProviderPool.report_success("default", chain, p2.id, :http) + Process.sleep(50) + + # Rate limit p1 via a rate limit error + rate_limit_err = + JError.new(-32_005, "Rate limited", + category: :rate_limit, + data: %{retry_after_ms: 30_000} + ) + + ProviderPool.report_failure("default", chain, p1.id, rate_limit_err, :http) + Process.sleep(50) + + # Both providers should appear in candidates + candidates = + ProviderPool.list_candidates("default", chain, %{ + protocol: :http, + include_half_open: true + }) + + ids = Enum.map(candidates, & &1.id) + assert p1.id in ids, "Rate-limited provider should still be a candidate" + assert p2.id in ids + + # p1 should be flagged as rate-limited + p1_candidate = Enum.find(candidates, &(&1.id == p1.id)) + assert p1_candidate.rate_limited.http == true + + # p2 should not be rate-limited + p2_candidate = Enum.find(candidates, &(&1.id == p2.id)) + assert p2_candidate.rate_limited.http == false + end + + test "rate-limited providers are not excluded from selection" do + p1 = provider_struct(%{id: "rl_tier_only_p1", name: "Only Provider", priority: 1}) + chain = "rl_tiering_test_2" + chain_config = base_chain_config([p1]) + + {:ok, _pid} = ProviderPool.start_link({"default", chain, chain_config}) + + # Make healthy then rate limit + ProviderPool.report_success("default", chain, p1.id, :http) + Process.sleep(50) + + rate_limit_err = + JError.new(-32_005, "Rate limited", + category: :rate_limit, + data: %{retry_after_ms: 30_000} + ) + + ProviderPool.report_failure("default", chain, p1.id, rate_limit_err, :http) + Process.sleep(50) + + # Even though p1 is rate-limited, it should still be a candidate + # (prevents "no candidates" when all providers are rate-limited) + candidates = + ProviderPool.list_candidates("default", chain, %{ + protocol: :http, + include_half_open: true + }) + + assert length(candidates) == 1 + assert hd(candidates).id == p1.id + end + end +end From 
60e92dc6d5650f5883f046ec1afc7b34ee2cfd28 Mon Sep 17 00:00:00 2001 From: "Jackson Ernst - jaxer.eth" <51183683+jaxernst@users.noreply.github.com> Date: Tue, 10 Feb 2026 15:41:43 -0800 Subject: [PATCH 2/2] Sync from lasso-cloud: error handling fixes, load_balanced strategy rename, graceful shutdown Key changes: - Rename round_robin strategy to load_balanced across routes, controllers, UI, and config - Add backward-compatible round-robin route aliases - Add Plug.Cowboy.Drainer for graceful HTTP request draining on shutdown - Fix error handling: probe classification, client_error failover, 4xx reclassification - Fix providers stuck unhealthy: graduated recovery, error exclusion, block range pre-filtering - Update provider adapters (dRPC URL fix, merkle, llamarpc, 1rpc, generic) - Add load_balanced strategy implementation - Add new test coverage for error normalizer, failover strategy, probe classification - Update docs: API reference, architecture, configuration, observability, routing Synced from cloud commits 9f19dfc..2826269. 
Co-Authored-By: Claude Opus 4.6 --- CHANGELOG.md | 2 +- README.md | 4 +- assets/js/app.js | 4 +- config/config.exs | 4 +- config/profiles/default.yml | 213 +++++++++++++++- docs/API_REFERENCE.md | 6 +- docs/ARCHITECTURE.md | 12 +- docs/CONFIGURATION.md | 19 +- docs/FUTURE_FEATURES.md | 2 +- docs/OBSERVABILITY.md | 9 +- docs/ROUTING.md | 240 ++++++++++++++++++ lib/lasso/application.ex | 6 +- lib/lasso/config/runtime_config.ex | 2 +- .../core/health_probe/batch_coordinator.ex | 11 +- lib/lasso/core/providers/adapter_helpers.ex | 78 +++--- lib/lasso/core/providers/adapters/1rpc.ex | 49 +--- lib/lasso/core/providers/adapters/alchemy.ex | 51 +--- lib/lasso/core/providers/adapters/drpc.ex | 49 +--- lib/lasso/core/providers/adapters/generic.ex | 9 + lib/lasso/core/providers/adapters/llamarpc.ex | 49 +--- lib/lasso/core/providers/adapters/merkle.ex | 49 +--- lib/lasso/core/providers/provider_pool.ex | 206 ++------------- lib/lasso/core/request/request_options.ex | 10 +- .../core/request/request_options_builder.ex | 7 +- lib/lasso/core/request/request_pipeline.ex | 7 +- .../request_pipeline/failover_strategy.ex | 71 ++++-- .../request/request_pipeline/observability.ex | 18 +- lib/lasso/core/selection/selection.ex | 34 ++- .../core/selection/strategies/fastest.ex | 23 +- .../selection/strategies/latency_weighted.ex | 28 +- .../selection/strategies/load_balanced.ex | 44 ++++ .../core/strategies/strategy_registry.ex | 11 +- .../core/support/error_classification.ex | 1 + lib/lasso/core/support/error_normalizer.ex | 89 ++++--- .../components/simulator_controls.ex | 12 +- lib/lasso_web/controllers/rpc_controller.ex | 7 +- lib/lasso_web/dashboard/endpoint_helpers.ex | 8 +- lib/lasso_web/router.ex | 6 +- lib/lasso_web/rpc/helpers.ex | 6 +- scripts/rpc_load_test.mjs | 60 +++-- ...on_with_ws_disconnect_integration_test.exs | 2 +- .../passthrough_integration_test.exs | 22 +- .../request_pipeline_integration_test.exs | 38 +-- ...ort_failure_reporting_integration_test.exs | 2 +- 
.../probe_classification_test.exs | 88 +++++++ .../failover_strategy_test.exs | 130 ++++++++++ .../core/request/request_pipeline_test.exs | 36 +-- .../core/support/error_normalizer_test.exs | 150 +++++++++++ .../rpc/provider_pool_probe_health_test.exs | 100 ++++++++ test/support/lasso/testing/mock_provider.ex | 2 +- test/support/lasso/testing/telemetry_sync.ex | 8 +- test/support/lasso_integration_case.ex | 2 +- 52 files changed, 1423 insertions(+), 673 deletions(-) create mode 100644 docs/ROUTING.md create mode 100644 lib/lasso/core/selection/strategies/load_balanced.ex create mode 100644 test/lasso/core/health_probe/probe_classification_test.exs create mode 100644 test/lasso/core/request/request_pipeline/failover_strategy_test.exs create mode 100644 test/lasso/core/support/error_normalizer_test.exs diff --git a/CHANGELOG.md b/CHANGELOG.md index c67f6c42..b97753c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Multi-provider, multi-chain Ethereum JSON-RPC proxy for HTTP and WebSocket -- Intelligent routing strategies: fastest, round-robin, latency-weighted +- Intelligent routing strategies: fastest, load-balanced, latency-weighted - Per-method, per-transport latency benchmarking - Profile system for isolated routing configurations (dev/staging/prod/multi-tenant) - Circuit breakers with per-provider, per-transport state diff --git a/README.md b/README.md index 12d06886..6a895cb7 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ Different providers excel at different workloads (hot reads vs archival queries ## Features - **Multi-provider, multi-chain** Ethereum JSON-RPC proxy for **HTTP + WebSocket** -- **Routing strategies**: `fastest`, `round-robin`, `latency-weighted`, plus provider override routes +- **Routing strategies**: `fastest`, `load-balanced`, `latency-weighted`, plus provider override routes - **Method-aware benchmarking**: latency tracked 
per **provider × method × transport** - **Resilience**: circuit breakers, retries, and transport-aware failover - **WebSocket subscriptions**: multiplexing with optional gap-filling via HTTP on upstream failure @@ -77,7 +77,7 @@ HTTP (POST): - `/rpc/:chain` (default strategy) - `/rpc/fastest/:chain` -- `/rpc/round-robin/:chain` +- `/rpc/load-balanced/:chain` - `/rpc/latency-weighted/:chain` - `/rpc/provider/:provider_id/:chain` (provider override) diff --git a/assets/js/app.js b/assets/js/app.js index 2c994333..4866e3c6 100644 --- a/assets/js/app.js +++ b/assets/js/app.js @@ -804,7 +804,7 @@ const DraggableNetworkViewport = { // Endpoint Selector Hook for Chain Details const EndpointSelector = { mounted() { - this.selectedStrategy = "round-robin"; // default strategy + this.selectedStrategy = "load-balanced"; // default strategy this.selectedProvider = null; // no provider selected by default this.mode = "strategy"; // 'strategy' or 'provider' this.selectedProviderSupportsWs = false; // default to false @@ -1075,7 +1075,7 @@ const EndpointSelector = { if (this.mode === "strategy" && this.selectedStrategy) { const descriptions = { - "round-robin": + "load-balanced": "Distributes requests evenly across all available providers — good for general purpose workloads", "latency-weighted": "Load balanced favoring faster providers — good for high-throughput workloads like indexing and backfilling", diff --git a/config/config.exs b/config/config.exs index 3a199f44..00555738 100644 --- a/config/config.exs +++ b/config/config.exs @@ -13,8 +13,8 @@ config :lasso, LassoWeb.Endpoint, secret_key_base: "YourSecretKeyBaseHere" <> String.duplicate("a", 32) # Default provider selection strategy -# Options: :fastest, :round_robin, :latency_weighted -config :lasso, :provider_selection_strategy, :round_robin +# Options: :fastest, :load_balanced, :latency_weighted +config :lasso, :provider_selection_strategy, :load_balanced # Default HTTP client adapter config :lasso, :http_client, 
Lasso.RPC.Transport.HTTP.Client.Finch diff --git a/config/profiles/default.yml b/config/profiles/default.yml index 89c1e692..eb8da5ab 100644 --- a/config/profiles/default.yml +++ b/config/profiles/default.yml @@ -1,6 +1,8 @@ --- -name: Lasso Public +name: Lasso Free slug: default +rps_limit: 100 +burst_limit: 500 --- # Lasso RPC Profile Configuration # Frontmatter: Profile metadata (name, slug) @@ -61,12 +63,14 @@ chains: url: "https://eth.drpc.org" ws_url: "wss://eth.drpc.org" archival: true + subscribe_new_heads: true - id: "ethereum_publicnode" name: "PublicNode Ethereum" priority: 3 url: "https://ethereum-rpc.publicnode.com" ws_url: "wss://ethereum-rpc.publicnode.com" + subscribe_new_heads: true archival: false # PHE: Returns blocks but not historical logs (inverted index error) # Temporarily disable: Uses a proxied backend with some nodes using PHE and some not (create inconsistency in log responses) @@ -97,18 +101,23 @@ chains: url: "https://eth.llamarpc.com" ws_url: "wss://eth.llamarpc.com" archival: false # Mixed PHE nodes - default to false to avoid inverted index errors + subscribe_new_heads: false - id: "ethereum_blockpi" name: "BlockPI Ethereum" priority: 7 url: "https://ethereum.public.blockpi.network/v1/rpc/public" archival: true + adapter_config: + max_block_range: 1024 - id: "ethereum_nodies" name: "Nodies Ethereum" priority: 8 url: "https://ethereum-public.nodies.app" archival: true + adapter_config: + max_block_range: 500 - id: "ethereum_onfinality" name: "OnFinality Ethereum" @@ -178,7 +187,7 @@ chains: ui-topology: color: "#0052FF" - size: md + size: lg providers: - id: "base_llamarpc" @@ -186,6 +195,7 @@ chains: priority: 2 url: "https://base.llamarpc.com" ws_url: "wss://base.llamarpc.com" + subscribe_new_heads: true archival: true - id: "base_official" @@ -199,13 +209,17 @@ chains: priority: 4 url: "https://base.publicnode.com" ws_url: "wss://base.publicnode.com" + subscribe_new_heads: false archival: true - id: "base_drpc" name: "dRPC Base" 
priority: 5 + type: "public" url: "https://base.drpc.org" ws_url: "wss://base.drpc.org" + subscribe_new_heads: false + api_key_required: false archival: true - id: "base_lava" @@ -239,7 +253,7 @@ chains: ui-topology: color: "#28A0F0" - size: md + size: lg providers: - id: "arbitrum_drpc" @@ -248,6 +262,7 @@ chains: url: "https://arbitrum.drpc.org" ws_url: "wss://arbitrum.drpc.org" archival: true + subscribe_new_heads: false - id: "arbitrum_lava" name: "Lava Arbitrum" @@ -261,7 +276,7 @@ chains: url: "https://arbitrum-one-rpc.publicnode.com" ws_url: "wss://arbitrum-one-rpc.publicnode.com" archival: true - subscribe_new_heads: false + subscribe_new_heads: true - id: "arbitrum_meowrpc" name: "Meow RPC Arbitrum" @@ -274,9 +289,187 @@ chains: priority: 6 url: "https://arb-one.api.pocket.network" archival: true -# Tuning guidelines: -# L1: probe_interval_ms ~12s, max_lag_blocks 1-2, new_heads_timeout_ms ~35-42s -# L2: probe_interval_ms ~3-4s, max_lag_blocks 3-5, new_heads_timeout_ms ~15-20s -# Testnets: Use more lenient thresholds -# -# Provider adapter_config: See docs/ADAPTERS.md for provider-specific options + + # ────────────────────────────────────────────────────────────────────────── + # ETHEREUM SEPOLIA (Testnet) + # ────────────────────────────────────────────────────────────────────────── + ethereum-sepolia: + chain_id: 11155111 + name: "Ethereum Sepolia" + block_time_ms: 12000 + + monitoring: + probe_interval_ms: 15000 + lag_alert_threshold_blocks: 5 + + selection: + max_lag_blocks: 3 + archival_threshold: 128 + + websocket: + subscribe_new_heads: true + new_heads_timeout_ms: 42000 + failover: + max_backfill_blocks: 50 + backfill_timeout_ms: 30000 + + ui-topology: + color: "#627EEA" + size: sm + + providers: + - id: "sepolia_drpc" + name: "dRPC Sepolia" + priority: 2 + url: "https://sepolia.drpc.org" + ws_url: "wss://sepolia.drpc.org" + subscribe_new_heads: true + archival: true + + - id: "sepolia_publicnode" + name: "PublicNode Sepolia" + priority: 3 + url: 
"https://ethereum-sepolia-rpc.publicnode.com" + ws_url: "wss://ethereum-sepolia-rpc.publicnode.com" + subscribe_new_heads: true + archival: false + + - id: "sepolia_1rpc" + name: "1RPC Sepolia" + priority: 4 + url: "https://1rpc.io/sepolia" + archival: false + + - id: "sepolia_onfinality" + name: "OnFinality Sepolia" + priority: 5 + url: "https://eth-sepolia.api.onfinality.io/public" + archival: true + + - id: "sepolia_nodies" + name: "Nodies Sepolia" + priority: 6 + url: "https://ethereum-sepolia-public.nodies.app" + archival: true + adapter_config: + max_block_range: 500 + + # ────────────────────────────────────────────────────────────────────────── + # BASE SEPOLIA (Testnet, L2) + # ────────────────────────────────────────────────────────────────────────── + base-sepolia: + chain_id: 84532 + name: "Base Sepolia" + block_time_ms: 2000 + + monitoring: + probe_interval_ms: 5000 + lag_alert_threshold_blocks: 5 + + selection: + max_lag_blocks: 5 + archival_threshold: 750 + + websocket: + subscribe_new_heads: true + new_heads_timeout_ms: 20000 + failover: + max_backfill_blocks: 50 + backfill_timeout_ms: 30000 + + ui-topology: + color: "#0052FF" + size: sm + + providers: + - id: "base_sepolia_drpc" + name: "dRPC Base Sepolia" + priority: 2 + url: "https://base-sepolia.drpc.org" + ws_url: "wss://base-sepolia.drpc.org" + subscribe_new_heads: true + archival: true + + - id: "base_sepolia_publicnode" + name: "PublicNode Base Sepolia" + priority: 3 + url: "https://base-sepolia-rpc.publicnode.com" + ws_url: "wss://base-sepolia-rpc.publicnode.com" + subscribe_new_heads: true + archival: true + + - id: "base_sepolia_official" + name: "Base Official Sepolia" + priority: 4 + url: "https://sepolia.base.org" + archival: true + + - id: "base_sepolia_onfinality" + name: "OnFinality Base Sepolia" + priority: 5 + url: "https://base-sepolia.api.onfinality.io/public" + archival: true + + - id: "base_sepolia_nodies" + name: "Nodies Base Sepolia" + priority: 6 + url: 
"https://base-sepolia-public.nodies.app" + archival: true + adapter_config: + max_block_range: 500 + + # ────────────────────────────────────────────────────────────────────────── + # ARBITRUM SEPOLIA (Testnet, L2) + # ────────────────────────────────────────────────────────────────────────── + arbitrum-sepolia: + chain_id: 421614 + name: "Arbitrum Sepolia" + block_time_ms: 250 + + monitoring: + probe_interval_ms: 2000 + lag_alert_threshold_blocks: 15 + + selection: + max_lag_blocks: 10 + archival_threshold: 6000 + + websocket: + subscribe_new_heads: true + new_heads_timeout_ms: 15000 + failover: + max_backfill_blocks: 50 + backfill_timeout_ms: 30000 + + ui-topology: + color: "#28A0F0" + size: sm + + providers: + - id: "arb_sepolia_drpc" + name: "dRPC Arbitrum Sepolia" + priority: 2 + url: "https://arbitrum-sepolia.drpc.org" + ws_url: "wss://arbitrum-sepolia.drpc.org" + subscribe_new_heads: true + archival: true + + - id: "arb_sepolia_publicnode" + name: "PublicNode Arbitrum Sepolia" + priority: 3 + url: "https://arbitrum-sepolia-rpc.publicnode.com" + ws_url: "wss://arbitrum-sepolia-rpc.publicnode.com" + subscribe_new_heads: true + archival: true + + - id: "arb_sepolia_official" + name: "Arbitrum Official Sepolia" + priority: 4 + url: "https://sepolia-rollup.arbitrum.io/rpc" + archival: true + + - id: "arb_sepolia_onfinality" + name: "OnFinality Arbitrum Sepolia" + priority: 5 + url: "https://arbitrum-sepolia.api.onfinality.io/public" + archival: true diff --git a/docs/API_REFERENCE.md b/docs/API_REFERENCE.md index f6b80ec9..590a5e7b 100644 --- a/docs/API_REFERENCE.md +++ b/docs/API_REFERENCE.md @@ -12,7 +12,7 @@ All HTTP RPC endpoints accept `POST` requests with `Content-Type: application/js POST /rpc/:chain ``` -Routes using the default strategy (configurable, defaults to `:round_robin`). +Routes using the default strategy (configurable, defaults to `:load_balanced`). 
**Example:** @@ -26,7 +26,7 @@ curl -X POST http://localhost:4000/rpc/ethereum \ ``` POST /rpc/fastest/:chain -POST /rpc/round-robin/:chain +POST /rpc/load-balanced/:chain POST /rpc/latency-weighted/:chain ``` @@ -46,7 +46,7 @@ All routes above are available under a profile namespace: ``` POST /rpc/profile/:profile/:chain POST /rpc/profile/:profile/fastest/:chain -POST /rpc/profile/:profile/round-robin/:chain +POST /rpc/profile/:profile/load-balanced/:chain POST /rpc/profile/:profile/latency-weighted/:chain POST /rpc/profile/:profile/provider/:provider_id/:chain ``` diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 25a84777..4df0d543 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -8,7 +8,7 @@ Elixir/OTP application providing RPC provider orchestration and routing for bloc - **Multi-profile isolation**: Independent routing configurations per profile with isolated metrics and circuit breakers - **Transport-agnostic routing**: Unified pipeline routes across HTTP and WebSocket based on real-time performance -- **Provider orchestration**: Pluggable selection strategies (`:fastest`, `:latency_weighted`, `:round_robin`) +- **Provider orchestration**: Pluggable selection strategies (`:fastest`, `:latency_weighted`, `:load_balanced`) - **WebSocket subscription management**: Intelligent multiplexing with automatic failover and gap-filling - **Circuit breaker protection**: Per-provider, per-transport breakers prevent cascade failures - **Method-specific benchmarking**: Passive latency measurement per-chain, per-method, per-transport @@ -526,7 +526,11 @@ Health probes implement exponential backoff for degraded providers to reduce pro ### Available Strategies -**:fastest** (default) +**:load_balanced** (default) + +- Distributes requests across healthy providers with health-aware tiering + +**:fastest** - Lowest latency provider for method (passive benchmarking via BenchmarkStore) @@ -534,10 +538,6 @@ Health probes implement exponential backoff for 
degraded providers to reduce pro - Weighted random selection by latency scores -**:round_robin** - -- Simple rotation through healthy providers - ### Selection API ```elixir diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index 92e81cc0..cf6e0cc8 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -163,7 +163,7 @@ Strategies control how providers are selected for each request. Set via URL path |----------|----------|-------------| | **Priority** | `/rpc/:chain` | Select by `priority` field (lowest first). Default strategy | | **Fastest** | `/rpc/fastest/:chain` | Lowest latency provider for the method (passive benchmarking) | -| **Round Robin** | `/rpc/round-robin/:chain` | Rotate through healthy providers | +| **Load Balanced** | `/rpc/load-balanced/:chain` | Distribute requests across healthy providers with health-aware tiering | | **Latency Weighted** | `/rpc/latency-weighted/:chain` | Weighted random selection by latency scores | ### When to Use Each Strategy @@ -172,10 +172,23 @@ Strategies control how providers are selected for each request. Set via URL path **Fastest** — Optimal latency. Benchmarks are tracked per provider, per method, per transport. Best for latency-sensitive reads. May concentrate load on one provider. -**Round Robin** — Even distribution. Spreads load across all healthy providers. Good for throughput maximization and avoiding rate limits. +**Load Balanced** — Even distribution with health-aware tiering. Spreads load across all healthy providers, deprioritizing those with tripped circuit breakers or rate limits. Good for throughput maximization and avoiding rate limits. **Latency Weighted** — Balanced approach. Routes more traffic to faster providers while still using slower ones. Prevents single-provider concentration while favoring performance. +### Health-Based Tiering + +All strategies are subject to health-based tiering after initial ranking. 
The pipeline reorders providers into 4 tiers based on circuit breaker state and rate limit status: + +1. **Tier 1**: Closed circuit + not rate-limited (preferred) +2. **Tier 2**: Closed circuit + rate-limited +3. **Tier 3**: Half-open circuit + not rate-limited +4. **Tier 4**: Half-open circuit + rate-limited + +Open-circuit providers are excluded entirely. Within each tier, the strategy's original ranking is preserved. + +This ensures healthy providers receive traffic first while allowing recovering providers to gradually reintegrate. See [ROUTING.md](ROUTING.md#health-based-tiering) for detailed behavior and examples. + ### Provider Override Route directly to a specific provider, bypassing strategy selection: @@ -273,5 +286,5 @@ config :lasso, :circuit_breaker, ### Default Strategy ```elixir -config :lasso, :provider_selection_strategy, :round_robin +config :lasso, :provider_selection_strategy, :load_balanced ``` diff --git a/docs/FUTURE_FEATURES.md b/docs/FUTURE_FEATURES.md index deb9d795..8997d962 100644 --- a/docs/FUTURE_FEATURES.md +++ b/docs/FUTURE_FEATURES.md @@ -439,7 +439,7 @@ Development log of implemented features, moved from active roadmap. - `:fastest` - Performance-based via benchmarking - `:cheapest` - Cost-optimized (public providers first) - `:priority` - Static config-based ordering -- `:round_robin` - Load balancing across healthy providers +- `:load_balanced` - Load balancing across healthy providers - Strategy selection per endpoint: `/rpc/fastest/:chain`, etc. 
### ✅ Request Observability (Sept 2025) diff --git a/docs/OBSERVABILITY.md b/docs/OBSERVABILITY.md index d9d49240..ed6a2db0 100644 --- a/docs/OBSERVABILITY.md +++ b/docs/OBSERVABILITY.md @@ -96,7 +96,7 @@ Lasso's observability system provides comprehensive visibility into RPC request { "event": "rpc.request.completed", "request_id": "uuid-v4", - "strategy": "fastest|priority|round_robin|latency_weighted", + "strategy": "fastest|priority|load_balanced|latency_weighted", "chain": "ethereum", "transport": "http|ws", "jsonrpc_method": "eth_blockNumber", @@ -153,8 +153,9 @@ Lasso's observability system provides comprehensive visibility into RPC request - **protocol**: Transport protocol used - **selection_reason**: Why this provider was selected - `"fastest_method_latency"` - Performance-based (fastest strategy) + - `"static_priority"` - Config priority (priority strategy) - - `"round_robin_rotation"` - Load balancing (round_robin strategy) + - `"load_balanced_rotation"` - Load balancing (load_balanced strategy) - **retries**: Number of retry attempts (0 = first try succeeded) - **circuit_breaker_state**: CB state when request was made - `"closed"` - Healthy, normal operation @@ -203,7 +204,7 @@ X-Lasso-Meta: eyJ2ZXJzaW9uIjoiMS4wIiwic3RyYXRlZ3kiOiJjaGVh... 
{ "version": "1.0", "request_id": "uuid", - "strategy": "round_robin", + "strategy": "load_balanced", "chain": "ethereum", "transport": "http", "selected_provider": { "id": "ethereum_llamarpc", "protocol": "http" }, @@ -229,7 +230,7 @@ Standard JSON-RPC response enriched with `lasso_meta` field: "lasso_meta": { "version": "1.0", "request_id": "uuid", - "strategy": "round_robin", + "strategy": "load_balanced", "chain": "ethereum", "transport": "http", "selected_provider": { "id": "ethereum_llamarpc", "protocol": "http" }, diff --git a/docs/ROUTING.md b/docs/ROUTING.md new file mode 100644 index 00000000..badf3831 --- /dev/null +++ b/docs/ROUTING.md @@ -0,0 +1,240 @@ +# Routing + +Provider selection in Lasso operates as a 4-stage pipeline that transforms a pool of candidate providers into a single execution target. This pipeline balances performance, reliability, and load distribution based on real-time health metrics and historical performance data. + +## Pipeline Overview + +``` +Candidate Pool → Strategy Ranking → Health Tiering → Execution +``` + +1. **Candidate Pool**: All providers configured in the profile that support the requested chain and method +2. **Strategy Ranking**: Strategy-specific ordering (latency, weighted random, priority, etc.) +3. **Health Tiering**: Circuit breaker and rate limit state reordering into 4 tiers +4. **Execution**: Sequential attempts with failover + +The pipeline ensures that healthy providers receive preference while recovering providers are gradually reintroduced. Open-circuit providers are excluded entirely. + +## Routing Strategies + +Strategies control the initial ordering of providers. Select via URL path segment: `/rpc/:strategy/:chain`. + +### Load Balanced (Default) + +**URL**: `/rpc/load-balanced/:chain` +**Module**: `Lasso.RPC.Strategies.LoadBalanced` + +Random distribution across available providers. After shuffling, the tiering pipeline reorders based on circuit breaker state and rate limit status. 
+ +**Use When**: +- Maximizing throughput across multiple providers +- Avoiding rate limits through even distribution +- You have multiple providers of similar quality + +**Behavior**: +- Shuffles providers randomly on each request +- No preference for any provider absent health signals +- Combined with tiering, healthy providers still receive more traffic + +### Fastest + +**URL**: `/rpc/fastest/:chain` +**Module**: `Lasso.RPC.Strategies.Fastest` + +Selects the lowest-latency provider for the specific RPC method based on passive benchmarking. Latency is tracked per provider, per method, per transport. + +**Use When**: +- Latency is the primary concern +- You're willing to concentrate load on the fastest provider +- The method being called is latency-sensitive (e.g., `eth_getBlockByNumber`) + +**Behavior**: +- Ranks providers by measured latency (lowest first) +- Metrics older than 10 minutes are treated as stale and deprioritized +- Providers with no metrics use a dynamic fallback latency (P75 of known providers) +- Still subject to health tiering (closed-circuit providers preferred) + +**Configuration**: +- `FASTEST_MIN_CALLS`: Minimum calls for stable metrics (default: 3) +- `FASTEST_MIN_SUCCESS_RATE`: Minimum success rate filter (default: 0.9) + +### Latency Weighted + +**URL**: `/rpc/latency-weighted/:chain` +**Module**: `Lasso.RPC.Strategies.LatencyWeighted` + +Probabilistic selection weighted by latency, success rate, and confidence. Routes more traffic to faster providers while still using slower ones. 
+ +**Use When**: +- You want a balance between performance and distribution +- Avoiding single-provider concentration is important +- You have providers with significantly different performance profiles + +**Behavior**: +- Calculates a weight for each provider based on the formula: + ``` + weight = (1 / latency^beta) * success_rate * confidence * calls_scale + weight = max(weight, explore_floor) + ``` +- Higher weights = higher selection probability +- Providers with stale metrics (>10min) receive the `explore_floor` weight +- Providers with no metrics use fallback latency and conservative confidence + +**Configuration**: +- `LW_BETA`: Latency exponent (default: 3.0, higher = more aggressive preference for low latency) +- `LW_MS_FLOOR`: Minimum latency denominator (default: 30ms, prevents division by near-zero) +- `LW_EXPLORE_FLOOR`: Minimum weight floor (default: 0.05, ensures all providers get some traffic) +- `LW_MIN_CALLS`: Minimum calls for stable metrics (default: 3) +- `LW_MIN_SR`: Minimum success rate (default: 0.85) + +### Priority + +**URL**: `/rpc/:chain` (implicit when no strategy specified) +**Module**: `Lasso.RPC.Strategies.Priority` + +Selects providers in the order defined by the `priority` field in the profile. Lower priority values are tried first. 
+ +**Use When**: +- You have a preferred provider (e.g., your own node) with fallbacks +- Predictable routing order is more important than performance optimization +- You want explicit control over provider precedence + +**Behavior**: +- Sorts providers by priority field (ascending) +- No dynamic reordering based on performance +- Still subject to health tiering (closed-circuit providers preferred) + +**Configuration**: +Set `priority` in the profile YAML: + +```yaml +providers: + - id: "my_node" + url: "https://my-node.example.com" + priority: 1 + - id: "alchemy_backup" + url: "https://eth-mainnet.g.alchemy.com/v2/${ALCHEMY_API_KEY}" + priority: 2 +``` + +## Health-Based Tiering + +After strategy ranking, the pipeline applies a 4-tier reordering based on circuit breaker state and rate limit status. This ensures healthy providers receive traffic first while allowing recovering providers to gradually reintegrate. + +### The 4 Tiers + +Providers are reordered into these tiers (descending preference): + +1. **Tier 1**: Closed circuit + not rate-limited +2. **Tier 2**: Closed circuit + rate-limited +3. **Tier 3**: Half-open circuit + not rate-limited +4. **Tier 4**: Half-open circuit + rate-limited + +**Excluded**: Open circuit providers are filtered out entirely. + +Within each tier, the strategy's original ranking is preserved. For example, with load-balanced, Tier 1 providers remain shuffled relative to each other. + +### Circuit Breaker States + +Circuit breakers track provider health per transport (HTTP/WS independently): + +- **Closed**: Healthy. Provider is tried normally. +- **Half-open**: Recovering. Provider is deprioritized but periodically probed for recovery. +- **Open**: Failing. Provider is excluded from selection entirely until timeout expires. + +Circuit breakers trip based on consecutive failures and success rate thresholds. See [OBSERVABILITY.md](OBSERVABILITY.md#circuit-breaker-metrics) for metrics and configuration. 
+ +### Rate Limit Detection + +Rate limit status is tracked separately from circuit breakers. A provider can be closed-circuit but rate-limited, meaning it's healthy but temporarily throttled. + +Rate-limited providers are not excluded but are deprioritized to Tier 2 or Tier 4. This allows Lasso to continue using them while preferring providers with available capacity. + +## Example: Why Traffic Distribution May Be Uneven + +Even with the load-balanced strategy, traffic may appear concentrated on certain providers. This is intentional and reflects health tiering. + +**Scenario**: 3 providers (A, B, C) + +- Provider A: Closed circuit, not rate-limited → **Tier 1** +- Provider B: Closed circuit, rate-limited → **Tier 2** +- Provider C: Half-open circuit, not rate-limited → **Tier 3** + +With load-balanced, Lasso shuffles providers then reorders by tier: + +1. Provider A (Tier 1) receives the first attempt on every request +2. Provider B (Tier 2) receives attempts only if A fails +3. Provider C (Tier 3) receives attempts only if A and B both fail + +**Result**: Provider A receives ~100% of traffic as long as it succeeds. This is correct behavior—Lasso routes to the healthiest provider while maintaining fallbacks. + +To achieve truly even distribution, ensure all providers are in Tier 1 (closed circuit + not rate-limited). + +## Strategy-Health Interaction + +All strategies are subject to health tiering. The strategy determines the order within each tier, but tier ordering takes precedence. 
+ +| Strategy | Within-Tier Behavior | Cross-Tier Impact | +|----------|---------------------|-------------------| +| Load Balanced | Random shuffle | No preference, but healthy tiers dominate | +| Fastest | Latency-ordered | Fastest provider may not receive traffic if unhealthy | +| Latency Weighted | Weighted random | Weights apply only within each tier | +| Priority | Priority-ordered | Priority applies only within each tier | + +**Example**: With fastest strategy, if the fastest provider has a half-open circuit, any closed-circuit provider will be tried first, even if it's slower. + +## Execution and Failover + +After tiering, Lasso attempts providers sequentially until success or all providers are exhausted. + +**Success**: First provider that returns a valid RPC response (2xx status, valid JSON-RPC structure) + +**Failure**: Provider is skipped and the next provider in the list is tried. Failures increment circuit breaker counters. + +**All Exhausted**: Returns `503 Service Unavailable` with details about which providers were tried and why they failed. + +See [API_REFERENCE.md](API_REFERENCE.md#error-responses) for error format details. + +## Configuration Reference + +Strategy behavior can be tuned via environment variables: + +| Variable | Strategy | Default | Description | +|----------|----------|---------|-------------| +| `FASTEST_MIN_CALLS` | Fastest | 3 | Minimum calls for stable metrics | +| `FASTEST_MIN_SUCCESS_RATE` | Fastest | 0.9 | Minimum success rate filter | +| `LW_BETA` | Latency Weighted | 3.0 | Latency exponent | +| `LW_MS_FLOOR` | Latency Weighted | 30.0 | Minimum latency denominator (ms) | +| `LW_EXPLORE_FLOOR` | Latency Weighted | 0.05 | Minimum selection probability | +| `LW_MIN_CALLS` | Latency Weighted | 3 | Minimum calls for stable metrics | +| `LW_MIN_SR` | Latency Weighted | 0.85 | Minimum success rate | + +See [CONFIGURATION.md](CONFIGURATION.md#routing-strategies) for profile-level strategy configuration. 
+ +## Per-Method Routing + +Profiles support per-method strategy and provider overrides. This allows fine-grained control for methods with different performance characteristics. + +```yaml +routing: + default_strategy: "load_balanced" + method_overrides: + eth_getLogs: + strategy: "fastest" # Use fastest for log queries + eth_call: + providers: ["alchemy", "quicknode"] # Restrict call to specific providers +``` + +See [CONFIGURATION.md](CONFIGURATION.md#per-method-routing) for full per-method configuration options. + +## Provider Override + +Bypass strategy selection entirely by routing directly to a specific provider: + +``` +POST /rpc/provider/:provider_id/:chain +``` + +The provider ID must match a provider configured in the profile. Health checks and circuit breakers still apply—if the provider is open-circuit, the request will fail. + +This is useful for debugging, testing specific provider behavior, or implementing custom provider selection logic outside of Lasso. diff --git a/lib/lasso/application.ex b/lib/lasso/application.ex index cfcc425c..17dfac26 100644 --- a/lib/lasso/application.ex +++ b/lib/lasso/application.ex @@ -112,7 +112,11 @@ defmodule Lasso.Application do {Lasso.Config.ConfigStore, get_config_store_opts()}, # Start Phoenix endpoint - LassoWeb.Endpoint + LassoWeb.Endpoint, + + # Drain in-flight HTTP requests on shutdown (SIGTERM during deploys). + # Must be AFTER the endpoint so it starts after and stops before it. 
+ {Plug.Cowboy.Drainer, refs: [LassoWeb.Endpoint.HTTP], shutdown: 30_000} ] # See https://hexdocs.pm/elixir/Supervisor.html diff --git a/lib/lasso/config/runtime_config.ex b/lib/lasso/config/runtime_config.ex index 9a1a389a..309608cd 100644 --- a/lib/lasso/config/runtime_config.ex +++ b/lib/lasso/config/runtime_config.ex @@ -85,7 +85,7 @@ defmodule Lasso.Config.RuntimeConfig do """ @spec get_default_strategy() :: atom() def get_default_strategy do - Application.get_env(:lasso, :provider_selection_strategy, :round_robin) + Application.get_env(:lasso, :provider_selection_strategy, :load_balanced) end @doc """ diff --git a/lib/lasso/core/health_probe/batch_coordinator.ex b/lib/lasso/core/health_probe/batch_coordinator.ex index 55f2c838..ca0dc400 100644 --- a/lib/lasso/core/health_probe/batch_coordinator.ex +++ b/lib/lasso/core/health_probe/batch_coordinator.ex @@ -700,9 +700,9 @@ defmodule Lasso.HealthProbe.BatchCoordinator do {:ok, %Response.Error{error: jerror} = _error, _io_ms} -> if jerror.category == :rate_limit do - {:error, jerror} + {:ok, :rate_limited} else - {:ok, :error_response} + {:error, jerror} end {:error, reason, _io_ms} -> @@ -739,11 +739,8 @@ defmodule Lasso.HealthProbe.BatchCoordinator do {:ok, :parse_error} end - {:error, %{category: :rate_limit} = jerr} -> - {:error, jerr} - - {:error, %{retriable?: true} = _jerr} -> - {:ok, :error_response} + {:error, %{category: :rate_limit}} -> + {:ok, :rate_limited} {:error, reason} -> {:error, reason} diff --git a/lib/lasso/core/providers/adapter_helpers.ex b/lib/lasso/core/providers/adapter_helpers.ex index a9ce0f2a..527531cb 100644 --- a/lib/lasso/core/providers/adapter_helpers.ex +++ b/lib/lasso/core/providers/adapter_helpers.ex @@ -6,37 +6,8 @@ defmodule Lasso.RPC.Providers.AdapterHelpers do reducing code duplication and ensuring consistent behavior. """ - @doc """ - Reads adapter config value from context with fallback to default. 
+ alias Lasso.RPC.ChainState - Adapters use this helper to read per-provider configuration overrides - from the `adapter_config` field in provider configuration. If no override - is present, the default value is returned. - - ## Parameters - - * `ctx` - Request context containing provider_config - * `key` - Atom key for the config value (e.g., :eth_get_logs_block_range) - * `default` - Default value to use if key is not found in adapter_config - - ## Examples - - iex> ctx = %{ - ...> provider_config: %{ - ...> adapter_config: %{max_block_range: 100} - ...> } - ...> } - iex> AdapterHelpers.get_adapter_config(ctx, :max_block_range, 50) - 100 - - iex> ctx = %{} - iex> AdapterHelpers.get_adapter_config(ctx, :max_block_range, 50) - 50 - - iex> ctx = %{provider_config: %{adapter_config: %{}}} - iex> AdapterHelpers.get_adapter_config(ctx, :unknown_key, 42) - 42 - """ @spec get_adapter_config(map(), atom(), any()) :: any() def get_adapter_config(ctx, key, default) when is_map(ctx) and is_atom(key) do adapter_config = @@ -50,4 +21,51 @@ defmodule Lasso.RPC.Providers.AdapterHelpers do _ -> default end end + + @spec validate_block_range(list(), map(), pos_integer()) :: :ok | {:error, term()} + def validate_block_range([%{"fromBlock" => from, "toBlock" => to}], ctx, limit) do + with {:ok, range} <- compute_block_range(from, to, ctx), + true <- range > limit do + {:error, {:param_limit, "max #{limit} block range (got #{range})"}} + else + _ -> :ok + end + end + + def validate_block_range(_params, _ctx, _limit), do: :ok + + @spec compute_block_range(term(), term(), map()) :: {:ok, non_neg_integer()} | :error + def compute_block_range(from_block, to_block, ctx) do + with {:ok, from_num} <- parse_block_number(from_block, ctx), + {:ok, to_num} <- parse_block_number(to_block, ctx) do + {:ok, abs(to_num - from_num)} + else + _ -> :error + end + end + + @spec parse_block_number(term(), map()) :: {:ok, non_neg_integer()} | :error + def parse_block_number("latest", ctx), do: {:ok, 
estimate_current_block(ctx)} + def parse_block_number("earliest", _ctx), do: {:ok, 0} + def parse_block_number("pending", ctx), do: {:ok, estimate_current_block(ctx)} + + def parse_block_number("0x" <> hex, _ctx) do + case Integer.parse(hex, 16) do + {num, ""} -> {:ok, num} + _ -> :error + end + end + + def parse_block_number(num, _ctx) when is_integer(num), do: {:ok, num} + def parse_block_number(_value, _ctx), do: :error + + @spec estimate_current_block(map()) :: non_neg_integer() + def estimate_current_block(ctx) do + chain = Map.get(ctx, :chain, "ethereum") + + case ChainState.consensus_height(chain) do + {:ok, height} -> height + {:error, _} -> 0 + end + end end diff --git a/lib/lasso/core/providers/adapters/1rpc.ex b/lib/lasso/core/providers/adapters/1rpc.ex index bc691779..23045c9a 100644 --- a/lib/lasso/core/providers/adapters/1rpc.ex +++ b/lib/lasso/core/providers/adapters/1rpc.ex @@ -18,7 +18,7 @@ defmodule Lasso.RPC.Providers.Adapters.OneRPC do @behaviour Lasso.RPC.ProviderAdapter - alias Lasso.RPC.{ChainState, MethodRegistry} + alias Lasso.RPC.MethodRegistry alias Lasso.RPC.Providers.Generic import Lasso.RPC.Providers.AdapterHelpers @@ -65,53 +65,6 @@ defmodule Lasso.RPC.Providers.Adapters.OneRPC do def validate_params(_method, _params, _t, _ctx), do: :ok - # Private validation helpers - - defp validate_block_range([%{"fromBlock" => from, "toBlock" => to}], ctx, limit) do - with {:ok, range} <- compute_block_range(from, to, ctx), - true <- range > limit do - {:error, {:param_limit, "max #{limit} block range (got #{range})"}} - else - _ -> :ok - end - end - - defp validate_block_range(_params, _ctx, _limit), do: :ok - - defp compute_block_range(from_block, to_block, ctx) do - with {:ok, from_num} <- parse_block_number(from_block, ctx), - {:ok, to_num} <- parse_block_number(to_block, ctx) do - {:ok, abs(to_num - from_num)} - else - _ -> :error - end - end - - defp parse_block_number("latest", ctx), do: {:ok, estimate_current_block(ctx)} - defp 
parse_block_number("earliest", _ctx), do: {:ok, 0} - defp parse_block_number("pending", ctx), do: {:ok, estimate_current_block(ctx)} - - defp parse_block_number("0x" <> hex, _ctx) do - case Integer.parse(hex, 16) do - {num, ""} -> {:ok, num} - _ -> :error - end - end - - defp parse_block_number(num, _ctx) when is_integer(num), do: {:ok, num} - defp parse_block_number(_value, _ctx), do: :error - - # Estimates current block from cache, skipping validation if unavailable - # This allows requests to proceed when consensus is unavailable (fail-open) - defp estimate_current_block(ctx) do - chain = Map.get(ctx, :chain, "ethereum") - - case ChainState.consensus_height(chain) do - {:ok, height} -> height - {:error, _} -> 0 - end - end - # Normalization - delegate to Generic adapter @impl true diff --git a/lib/lasso/core/providers/adapters/alchemy.ex b/lib/lasso/core/providers/adapters/alchemy.ex index bdc1518c..4f31cb2f 100644 --- a/lib/lasso/core/providers/adapters/alchemy.ex +++ b/lib/lasso/core/providers/adapters/alchemy.ex @@ -14,7 +14,7 @@ defmodule Lasso.RPC.Providers.Adapters.Alchemy do @behaviour Lasso.RPC.ProviderAdapter - alias Lasso.RPC.{ChainState, MethodRegistry} + alias Lasso.RPC.MethodRegistry alias Lasso.RPC.Providers.Generic import Lasso.RPC.Providers.AdapterHelpers @@ -46,7 +46,7 @@ defmodule Lasso.RPC.Providers.Adapters.Alchemy do block_range_limit = get_adapter_config(ctx, :eth_get_logs_block_range, @default_eth_get_logs_block_range) - case validate_logs_block_range(params, ctx, block_range_limit) do + case validate_block_range(params, ctx, block_range_limit) do :ok -> :ok @@ -63,53 +63,6 @@ defmodule Lasso.RPC.Providers.Adapters.Alchemy do def validate_params(_method, _params, _t, _ctx), do: :ok - # Private validation helpers - - defp validate_logs_block_range([%{"fromBlock" => from, "toBlock" => to}], ctx, limit) do - with {:ok, range} <- compute_block_range(from, to, ctx), - true <- range > limit do - {:error, {:param_limit, "max #{limit} block range 
(got #{range})"}} - else - _ -> :ok - end - end - - defp validate_logs_block_range(_params, _ctx, _limit), do: :ok - - defp compute_block_range(from_block, to_block, ctx) do - with {:ok, from_num} <- parse_block_number(from_block, ctx), - {:ok, to_num} <- parse_block_number(to_block, ctx) do - {:ok, abs(to_num - from_num)} - else - _ -> :error - end - end - - defp parse_block_number("latest", ctx), do: {:ok, estimate_current_block(ctx)} - defp parse_block_number("earliest", _ctx), do: {:ok, 0} - defp parse_block_number("pending", ctx), do: {:ok, estimate_current_block(ctx)} - - defp parse_block_number("0x" <> hex, _ctx) do - case Integer.parse(hex, 16) do - {num, ""} -> {:ok, num} - _ -> :error - end - end - - defp parse_block_number(num, _ctx) when is_integer(num), do: {:ok, num} - defp parse_block_number(_value, _ctx), do: :error - - # Estimates current block from cache, skipping validation if unavailable - # This allows requests to proceed when consensus is unavailable (fail-open) - defp estimate_current_block(ctx) do - chain = Map.get(ctx, :chain, "ethereum") - - case ChainState.consensus_height(chain) do - {:ok, height} -> height - {:error, _} -> 0 - end - end - # Normalization - delegate to Generic adapter @impl true diff --git a/lib/lasso/core/providers/adapters/drpc.ex b/lib/lasso/core/providers/adapters/drpc.ex index becdc9bc..19d0ac0a 100644 --- a/lib/lasso/core/providers/adapters/drpc.ex +++ b/lib/lasso/core/providers/adapters/drpc.ex @@ -22,7 +22,7 @@ defmodule Lasso.RPC.Providers.Adapters.DRPC do @behaviour Lasso.RPC.ProviderAdapter - alias Lasso.RPC.{ChainState, MethodRegistry} + alias Lasso.RPC.MethodRegistry alias Lasso.RPC.Providers.Generic import Lasso.RPC.Providers.AdapterHelpers @@ -69,53 +69,6 @@ defmodule Lasso.RPC.Providers.Adapters.DRPC do def validate_params(_method, _params, _t, _ctx), do: :ok - # Private validation helpers - - defp validate_block_range([%{"fromBlock" => from, "toBlock" => to}], ctx, limit) do - with {:ok, range} <- 
compute_block_range(from, to, ctx), - true <- range > limit do - {:error, {:param_limit, "max #{limit} block range (got #{range})"}} - else - _ -> :ok - end - end - - defp validate_block_range(_params, _ctx, _limit), do: :ok - - defp compute_block_range(from_block, to_block, ctx) do - with {:ok, from_num} <- parse_block_number(from_block, ctx), - {:ok, to_num} <- parse_block_number(to_block, ctx) do - {:ok, abs(to_num - from_num)} - else - _ -> :error - end - end - - defp parse_block_number("latest", ctx), do: {:ok, estimate_current_block(ctx)} - defp parse_block_number("earliest", _ctx), do: {:ok, 0} - defp parse_block_number("pending", ctx), do: {:ok, estimate_current_block(ctx)} - - defp parse_block_number("0x" <> hex, _ctx) do - case Integer.parse(hex, 16) do - {num, ""} -> {:ok, num} - _ -> :error - end - end - - defp parse_block_number(num, _ctx) when is_integer(num), do: {:ok, num} - defp parse_block_number(_value, _ctx), do: :error - - # Estimates current block from cache, skipping validation if unavailable - # This allows requests to proceed when consensus is unavailable (fail-open) - defp estimate_current_block(ctx) do - chain = Map.get(ctx, :chain, "ethereum") - - case ChainState.consensus_height(chain) do - {:ok, height} -> height - {:error, _} -> 0 - end - end - # Normalization - delegate to Generic adapter @impl true diff --git a/lib/lasso/core/providers/adapters/generic.ex b/lib/lasso/core/providers/adapters/generic.ex index 0db904ba..00a00d93 100644 --- a/lib/lasso/core/providers/adapters/generic.ex +++ b/lib/lasso/core/providers/adapters/generic.ex @@ -18,12 +18,21 @@ defmodule Lasso.RPC.Providers.Generic do alias Lasso.JSONRPC.Error, as: JError + import Lasso.RPC.Providers.AdapterHelpers + # Capability Validation (Permissive - assumes all methods supported) @impl true def supports_method?(_method, _transport, _ctx), do: :ok @impl true + def validate_params("eth_getLogs", params, _transport, ctx) do + case get_adapter_config(ctx, :max_block_range, 
nil) do + nil -> :ok + limit -> validate_block_range(params, ctx, limit) + end + end + def validate_params(_method, _params, _transport, _ctx), do: :ok # Normalization (Standard JSON-RPC 2.0) diff --git a/lib/lasso/core/providers/adapters/llamarpc.ex b/lib/lasso/core/providers/adapters/llamarpc.ex index 505a09c3..2d926723 100644 --- a/lib/lasso/core/providers/adapters/llamarpc.ex +++ b/lib/lasso/core/providers/adapters/llamarpc.ex @@ -17,7 +17,7 @@ defmodule Lasso.RPC.Providers.Adapters.LlamaRPC do @behaviour Lasso.RPC.ProviderAdapter - alias Lasso.RPC.{ChainState, MethodRegistry} + alias Lasso.RPC.MethodRegistry alias Lasso.RPC.Providers.Generic import Lasso.RPC.Providers.AdapterHelpers @@ -64,53 +64,6 @@ defmodule Lasso.RPC.Providers.Adapters.LlamaRPC do def validate_params(_method, _params, _t, _ctx), do: :ok - # Private validation helpers - - defp validate_block_range([%{"fromBlock" => from, "toBlock" => to}], ctx, limit) do - with {:ok, range} <- compute_block_range(from, to, ctx), - true <- range > limit do - {:error, {:param_limit, "max #{limit} block range (got #{range})"}} - else - _ -> :ok - end - end - - defp validate_block_range(_params, _ctx, _limit), do: :ok - - defp compute_block_range(from_block, to_block, ctx) do - with {:ok, from_num} <- parse_block_number(from_block, ctx), - {:ok, to_num} <- parse_block_number(to_block, ctx) do - {:ok, abs(to_num - from_num)} - else - _ -> :error - end - end - - defp parse_block_number("latest", ctx), do: {:ok, estimate_current_block(ctx)} - defp parse_block_number("earliest", _ctx), do: {:ok, 0} - defp parse_block_number("pending", ctx), do: {:ok, estimate_current_block(ctx)} - - defp parse_block_number("0x" <> hex, _ctx) do - case Integer.parse(hex, 16) do - {num, ""} -> {:ok, num} - _ -> :error - end - end - - defp parse_block_number(num, _ctx) when is_integer(num), do: {:ok, num} - defp parse_block_number(_value, _ctx), do: :error - - # Estimates current block from cache, skipping validation if unavailable - 
# This allows requests to proceed when consensus is unavailable (fail-open) - defp estimate_current_block(ctx) do - chain = Map.get(ctx, :chain, "ethereum") - - case ChainState.consensus_height(chain) do - {:ok, height} -> height - {:error, _} -> 0 - end - end - # Normalization - delegate to Generic adapter @impl true diff --git a/lib/lasso/core/providers/adapters/merkle.ex b/lib/lasso/core/providers/adapters/merkle.ex index 49a5fea0..5a216e45 100644 --- a/lib/lasso/core/providers/adapters/merkle.ex +++ b/lib/lasso/core/providers/adapters/merkle.ex @@ -17,7 +17,7 @@ defmodule Lasso.RPC.Providers.Adapters.Merkle do @behaviour Lasso.RPC.ProviderAdapter - alias Lasso.RPC.{ChainState, MethodRegistry} + alias Lasso.RPC.MethodRegistry alias Lasso.RPC.Providers.Generic import Lasso.RPC.Providers.AdapterHelpers @@ -64,53 +64,6 @@ defmodule Lasso.RPC.Providers.Adapters.Merkle do def validate_params(_method, _params, _t, _ctx), do: :ok - # Private validation helpers - - defp validate_block_range([%{"fromBlock" => from, "toBlock" => to}], ctx, limit) do - with {:ok, range} <- compute_block_range(from, to, ctx), - true <- range > limit do - {:error, {:param_limit, "max #{limit} block range (got #{range})"}} - else - _ -> :ok - end - end - - defp validate_block_range(_params, _ctx, _limit), do: :ok - - defp compute_block_range(from_block, to_block, ctx) do - with {:ok, from_num} <- parse_block_number(from_block, ctx), - {:ok, to_num} <- parse_block_number(to_block, ctx) do - {:ok, abs(to_num - from_num)} - else - _ -> :error - end - end - - defp parse_block_number("latest", ctx), do: {:ok, estimate_current_block(ctx)} - defp parse_block_number("earliest", _ctx), do: {:ok, 0} - defp parse_block_number("pending", ctx), do: {:ok, estimate_current_block(ctx)} - - defp parse_block_number("0x" <> hex, _ctx) do - case Integer.parse(hex, 16) do - {num, ""} -> {:ok, num} - _ -> :error - end - end - - defp parse_block_number(num, _ctx) when is_integer(num), do: {:ok, num} - defp 
parse_block_number(_value, _ctx), do: :error - - # Estimates current block from cache, skipping validation if unavailable - # This allows requests to proceed when consensus is unavailable (fail-open) - defp estimate_current_block(ctx) do - chain = Map.get(ctx, :chain, "ethereum") - - case ChainState.consensus_height(chain) do - {:ok, height} -> height - {:error, _} -> 0 - end - end - # Normalization - delegate to Generic adapter @impl true diff --git a/lib/lasso/core/providers/provider_pool.ex b/lib/lasso/core/providers/provider_pool.ex index ba0514b8..77cc6806 100644 --- a/lib/lasso/core/providers/provider_pool.ex +++ b/lib/lasso/core/providers/provider_pool.ex @@ -15,6 +15,7 @@ defmodule Lasso.RPC.ProviderPool do require Logger alias Lasso.Core.Support.CircuitBreaker + alias Lasso.Core.Support.ErrorClassification alias Lasso.Events.Provider alias Lasso.JSONRPC.Error, as: JError alias Lasso.RPC.{ChainState, RateLimitState, SelectionFilters} @@ -118,7 +119,7 @@ defmodule Lasso.RPC.ProviderPool do @type profile :: String.t() @type chain_name :: String.t() @type provider_id :: String.t() - @type strategy :: :priority | :round_robin | :fastest | :latency_weighted + @type strategy :: :priority | :load_balanced | :fastest | :latency_weighted # Note: :rate_limited is NOT a health status - rate limits are tracked via RateLimitState @type health_status :: :healthy @@ -477,29 +478,6 @@ defmodule Lasso.RPC.ProviderPool do end end - @doc """ - Reports probe results from ProviderProbe (called by ProviderProbe). - Results are processed in batch to maintain consistency. - - Accepts optional profile as first argument (defaults to "default"). - """ - @deprecated "Use report_probe_results(profile, chain_name, results) instead with explicit profile parameter" - @spec report_probe_results(chain_name, [map()]) :: :ok - def report_probe_results(chain_name, results) when is_list(results) do - IO.warn( - "ProviderPool.report_probe_results/2 defaults to 'default' profile. 
Pass profile explicitly." - ) - - report_probe_results("default", chain_name, results) - end - - @deprecated "Use update_probe_health/5 instead" - @spec report_probe_results(profile, chain_name, [map()]) :: :ok - def report_probe_results(profile, chain_name, results) - when is_binary(profile) and is_list(results) do - GenServer.cast(via_name(profile, chain_name), {:probe_results, results}) - end - @doc """ Updates a provider's health status based on probe results. @@ -895,17 +873,6 @@ defmodule Lasso.RPC.ProviderPool do def handle_cast({:report_success, provider_id, nil}, state), do: {:noreply, update_provider_success(state, provider_id)} - @impl true - def handle_cast({:probe_results, results}, state) do - # Process ALL results in batch (avoid race conditions) - state = apply_probe_batch(state, results) - - # Broadcast health pulse for dashboard cluster sync - broadcast_health_pulses(state, results) - - {:noreply, state} - end - # Handle newHeads update from WebSocket subscriptions (via BlockHeightMonitor) @impl true def handle_cast({:newheads, provider_id, block_height}, state) do @@ -970,15 +937,25 @@ defmodule Lasso.RPC.ProviderPool do updated = case result do :success -> - if current_transport_status == :connecting do - provider - |> Map.put(status_field, :healthy) - |> Map.put(:consecutive_successes, provider.consecutive_successes + 1) - |> Map.put(:consecutive_failures, 0) - |> Map.put(:last_health_check, now) - |> then(&Map.put(&1, :status, derive_aggregate_status(&1))) - else - Map.put(provider, :last_health_check, now) + cond do + current_transport_status in [:connecting, :degraded] -> + provider + |> Map.put(status_field, :healthy) + |> Map.put(:consecutive_successes, provider.consecutive_successes + 1) + |> Map.put(:consecutive_failures, 0) + |> Map.put(:last_health_check, now) + |> then(&Map.put(&1, :status, derive_aggregate_status(&1))) + + current_transport_status == :unhealthy -> + provider + |> Map.put(status_field, :degraded) + |> 
Map.put(:consecutive_successes, provider.consecutive_successes + 1) + |> Map.put(:consecutive_failures, @failure_threshold - 1) + |> Map.put(:last_health_check, now) + |> then(&Map.put(&1, :status, derive_aggregate_status(&1))) + + true -> + Map.put(provider, :last_health_check, now) end {:failure, reason} -> @@ -1839,8 +1816,7 @@ defmodule Lasso.RPC.ProviderPool do |> put_provider_and_refresh(provider_id, updated) |> record_rate_limit_for_provider(provider_id, transport, retry_after_ms) - jerr.category == :client_error -> - # Client errors don't affect provider health status + not ErrorClassification.breaker_penalty?(jerr.category) -> updated = provider |> Map.put(:last_error, jerr) @@ -1852,7 +1828,6 @@ defmodule Lasso.RPC.ProviderPool do new_failures = provider.consecutive_failures + 1 new_status = derive_failure_status(new_failures) - # Report to circuit breaker cb_id = {state.profile, state.chain_name, provider_id, transport} CircuitBreaker.record_failure(cb_id, jerr) @@ -1887,7 +1862,7 @@ defmodule Lasso.RPC.ProviderPool do state provider -> - {jerr, context} = normalize_error_for_pool(error, provider_id) + {jerr, _context} = normalize_error_for_pool(error, provider_id) cond do jerr.category == :rate_limit -> @@ -1908,8 +1883,7 @@ defmodule Lasso.RPC.ProviderPool do |> put_provider_and_refresh(provider_id, updated_provider) |> record_rate_limit_for_provider(provider_id, transport, retry_after_ms) - jerr.category == :client_error and context == :live_traffic -> - # Client errors from live traffic don't affect provider health status + not ErrorClassification.breaker_penalty?(jerr.category) -> updated_provider = provider |> Map.merge(%{ @@ -2172,142 +2146,10 @@ defmodule Lasso.RPC.ProviderPool do Map.put(circuit_errors, provider_id, updated_errors) end - # Processes a batch of probe results - defp apply_probe_batch(state, results) do - # First pass: Update sync state for all successful probes - state = - Enum.reduce(results, state, fn result, acc_state -> - if 
result.success? do - # Store: height, timestamp, sequence number - :ets.insert( - acc_state.table, - {{:provider_sync, acc_state.chain_name, result.provider_id}, - {result.block_height, result.timestamp, result.sequence}} - ) - - # Update HTTP transport availability for health policy (probe uses HTTP) - update_provider_success_http(acc_state, result.provider_id) - else - # Tag failures as coming from health_check context for policy handling - update_provider_failure_http( - acc_state, - result.provider_id, - {:health_check, result.error} - ) - end - end) - - # Second pass: Calculate lag for successful providers only - # NOTE: Consensus is calculated lazily by ChainState - update_provider_lags(state, results) - - state - end - - defp update_provider_lags(state, probe_results) do - # Get consensus height (lazy calculation from ChainState) - case ChainState.consensus_height(state.chain_name) do - {:ok, consensus_height} -> - # Calculate lag only for successful probes - Enum.each(probe_results, fn result -> - if result.success? 
do - lag = result.block_height - consensus_height - - # Store lag - :ets.insert( - state.table, - {{:provider_lag, state.chain_name, result.provider_id}, lag} - ) - - # Broadcast sync update for dashboard live updates - Phoenix.PubSub.broadcast(Lasso.PubSub, "sync:updates:#{state.profile}", %{ - chain: state.chain_name, - provider_id: result.provider_id, - block_height: result.block_height, - consensus_height: consensus_height, - lag: lag - }) - - # Emit telemetry when lagging beyond threshold - threshold = get_lag_threshold_for_chain(state.profile, state.chain_name) - - if lag < -threshold do - emit_lag_telemetry(state.chain_name, result.provider_id, lag) - end - end - end) - - {:error, _reason} -> - # No consensus available - don't calculate lag - :ok - end - end - defp get_current_sequence(_state) do - # For newHeads, use timestamp-based pseudo-sequence - # In the future, could maintain a sequence counter div(System.system_time(:millisecond), 1000) end - defp get_lag_threshold_for_chain(profile, chain) do - # Get lag threshold from chain configuration (chains.yml) - # Falls back to application config if chain not found - case Lasso.Config.ConfigStore.get_chain(profile, chain) do - {:ok, chain_config} -> - chain_config.monitoring.lag_alert_threshold_blocks - - {:error, _} -> - # Fallback to application config for backwards compatibility - thresholds = - Application.get_env(:lasso, :provider_probe, []) - |> Keyword.get(:lag_threshold_by_chain, %{}) - - Map.get(thresholds, chain) || - Application.get_env(:lasso, :provider_probe, []) - |> Keyword.get(:default_lag_threshold, 10) - end - end - - defp emit_lag_telemetry(chain, provider_id, lag) do - :telemetry.execute( - [:lasso, :provider_probe, :lag_detected], - %{lag_blocks: lag}, - %{chain: chain, provider_id: provider_id} - ) - end - - defp broadcast_health_pulses(state, results) do - node_id = get_local_node_id() - ts = System.system_time(:millisecond) - topic = "block_sync:#{state.profile}:#{state.chain_name}" - 
- for result <- results do - case Map.get(state.providers, result.provider_id) do - %{consecutive_failures: failures, consecutive_successes: successes} -> - msg = - {:provider_health_pulse, - %{ - profile: state.profile, - chain: state.chain_name, - provider_id: result.provider_id, - node_id: node_id, - consecutive_failures: failures, - consecutive_successes: successes, - ts: ts - }} - - Phoenix.PubSub.broadcast(Lasso.PubSub, topic, msg) - - _ -> - :ok - end - end - end - - defp get_local_node_id do - Lasso.Cluster.Topology.get_self_node_id() - end - @doc """ Returns the via tuple for the ProviderPool GenServer. diff --git a/lib/lasso/core/request/request_options.ex b/lib/lasso/core/request/request_options.ex index 9d0cce98..80ee2f33 100644 --- a/lib/lasso/core/request/request_options.ex +++ b/lib/lasso/core/request/request_options.ex @@ -8,12 +8,13 @@ defmodule Lasso.RPC.RequestOptions do alias Lasso.Config.MethodConstraints - @type strategy :: :fastest | :priority | :round_robin | :latency_weighted + @type strategy :: :fastest | :priority | :load_balanced | :latency_weighted @type transport :: :http | :ws | :both | nil @enforce_keys [:timeout_ms] defstruct profile: "default", - strategy: :round_robin, + account_id: nil, + strategy: :load_balanced, provider_override: nil, transport: nil, failover_on_override: false, @@ -25,6 +26,7 @@ defmodule Lasso.RPC.RequestOptions do @type t :: %__MODULE__{ profile: String.t(), + account_id: String.t() | nil, strategy: strategy, provider_override: String.t() | nil, transport: transport, @@ -54,13 +56,13 @@ defmodule Lasso.RPC.RequestOptions do end defp validate_strategy(strategy) - when strategy in [:fastest, :priority, :round_robin, :latency_weighted], + when strategy in [:fastest, :priority, :load_balanced, :round_robin, :latency_weighted], do: :ok defp validate_strategy(strategy), do: {:error, - "Invalid strategy: #{inspect(strategy)}. 
Must be one of: :fastest, :priority, :round_robin, :latency_weighted"} + "Invalid strategy: #{inspect(strategy)}. Must be one of: :fastest, :priority, :load_balanced, :latency_weighted"} defp validate_transport(%__MODULE__{transport: transport}, method) do required = MethodConstraints.required_transport_for(method) diff --git a/lib/lasso/core/request/request_options_builder.ex b/lib/lasso/core/request/request_options_builder.ex index 6469e759..79e859c9 100644 --- a/lib/lasso/core/request/request_options_builder.ex +++ b/lib/lasso/core/request/request_options_builder.ex @@ -66,7 +66,7 @@ defmodule Lasso.RPC.RequestOptions.Builder do Useful for CLI tools, internal services, or tests that don't have a Plug.Conn. ## Options (precedence: overrides > params > defaults) - - `:strategy` - Strategy atom (default: from app config) + - `:strategy` - Strategy atom (:load_balanced, :fastest, :latency_weighted, :priority) - `:provider_override` / `:provider_id` - Force specific provider - `:transport` - Transport preference (:http, :ws, :both) - `:failover_on_override` - Retry on other providers if override fails (default: false) @@ -180,7 +180,8 @@ defmodule Lasso.RPC.RequestOptions.Builder do @spec parse_strategy(String.t()) :: RequestOptions.strategy() | nil defp parse_strategy("priority"), do: :priority - defp parse_strategy("round_robin"), do: :round_robin + defp parse_strategy("load_balanced"), do: :load_balanced + defp parse_strategy("round_robin"), do: :load_balanced defp parse_strategy("fastest"), do: :fastest defp parse_strategy("latency_weighted"), do: :latency_weighted defp parse_strategy(nil), do: nil @@ -188,7 +189,7 @@ defmodule Lasso.RPC.RequestOptions.Builder do @spec default_strategy() :: RequestOptions.strategy() defp default_strategy, - do: Application.get_env(:lasso, :provider_selection_strategy, :round_robin) + do: Application.get_env(:lasso, :provider_selection_strategy, :load_balanced) @spec put_request_context(RequestOptions.t(), any()) :: 
RequestOptions.t() defp put_request_context(%RequestOptions{} = o, nil), do: o diff --git a/lib/lasso/core/request/request_pipeline.ex b/lib/lasso/core/request/request_pipeline.ex index 9bea326f..f0544bc0 100644 --- a/lib/lasso/core/request/request_pipeline.ex +++ b/lib/lasso/core/request/request_pipeline.ex @@ -65,7 +65,7 @@ defmodule Lasso.RPC.RequestPipeline do ## Options Takes a `RequestOptions` struct with: - - `strategy` - Routing strategy (:fastest, :round_robin, :latency_weighted, :priority) + - `strategy` - Routing strategy (:fastest, :load_balanced, :latency_weighted, :priority) - `provider_override` - Force specific provider (optional) - `transport` - Transport preference (:http, :ws, :both) - `failover_on_override` - Retry on other providers if override fails @@ -368,7 +368,7 @@ defmodule Lasso.RPC.RequestPipeline do |> RequestContext.increment_retries() |> RequestContext.track_error_category(error_category) - Observability.record_fast_fail(ctx, channel, failover_reason, reason) + Observability.record_fast_fail(ctx, channel, failover_reason, reason, latency_ms) attempt_channels(rest_channels, ctx) @@ -505,7 +505,8 @@ defmodule Lasso.RPC.RequestPipeline do transport: opts.transport || :http, strategy: opts.strategy, request_id: opts.request_id, - plug_start_time: opts.plug_start_time + plug_start_time: opts.plug_start_time, + account_id: opts.account_id ) end diff --git a/lib/lasso/core/request/request_pipeline/failover_strategy.ex b/lib/lasso/core/request/request_pipeline/failover_strategy.ex index 4a6766f1..95464c0e 100644 --- a/lib/lasso/core/request/request_pipeline/failover_strategy.ex +++ b/lib/lasso/core/request/request_pipeline/failover_strategy.ex @@ -6,27 +6,43 @@ defmodule Lasso.RPC.RequestPipeline.FailoverStrategy do or treat an error as terminal. The actual failover execution is handled by the RequestPipeline. + ## Decision Priority (clause ordering) + + 1. No channels remaining → terminal + 2. 
`:client_error` → conditional failover (threshold 1) + 3. `retriable? = false` → terminal + 4. `:capability_violation` → conditional failover (threshold 2) + 5. `:rate_limit` → failover + 6. `:server_error` → failover + 7. `:network_error` → failover + 8. `:auth_error` → failover + 9. `:timeout` → failover + 10. `retriable? = true` (generic) → failover + 11. `:circuit_open` → failover + 12. Unknown format → terminal + ## Smart Failover Detection To minimize latency variance, the strategy detects when the same error category occurs repeatedly across multiple providers (e.g., "query returned more than 10000 results"). - After a threshold (default: 2 occurrences), it assumes the error is universal and fails fast. - - This prevents scenarios where: - - Request tries Provider A → "result too large" (2s) - - Request tries Provider B → "result too large" (2s) - - Request tries Provider C → "result too large" (2s) - - Total wasted: 6+ seconds - - Instead: - - Request tries Provider A → "result too large" (2s) - - Request tries Provider B → "result too large" (2s) - - Strategy detects universal failure → fail fast - - Total: 4s (2 providers max) + After a threshold, it assumes the error is universal and fails fast. + + ## Client Error Safety Net + + HTTP 4xx errors classified as `:client_error` are non-retriable by default, but the + category is ambiguous — it's the catch-all for any 4xx that isn't a specific JSON-RPC + standard code. A dead provider returning 400 for all requests looks like a client error. + + To handle this, `:client_error` gets one failover attempt (threshold 1). If two providers + return the same class of error, it's almost certainly a real client error. + + Note: `track_error_category` is called in `handle_channel_error` (request_pipeline.ex) + AFTER `FailoverStrategy.decide`, so the count seen by `decide` reflects errors from + previous attempts, not the current one. This is what makes threshold=1 give exactly + one failover attempt. 
""" - # After this many providers return the same capability violation, - # assume it's a universal limitation and fail fast + @client_error_failover_threshold 1 @repeated_capability_violation_threshold 2 require Logger @@ -74,8 +90,29 @@ defmodule Lasso.RPC.RequestPipeline.FailoverStrategy do @spec should_failover?(any(), [Channel.t()], RequestContext.t()) :: {boolean(), atom()} defp should_failover?(_reason, [], _ctx), do: {false, :no_channels_remaining} - defp should_failover?(%JError{retriable?: false}, _rest, _ctx), - do: {false, :non_retriable_error} + defp should_failover?(%JError{category: :client_error} = error, _rest, ctx) do + # Count reflects errors from PREVIOUS attempts only — track_error_category + # is called after decide() in request_pipeline.ex handle_channel_error. + repeated_count = RequestContext.get_error_category_count(ctx, :client_error) + + if repeated_count >= @client_error_failover_threshold do + Logger.warning("Repeated client error across providers - treating as terminal", + request_id: ctx.request_id, + error_message: error.message, + method: ctx.method, + repeated_count: repeated_count, + chain: ctx.chain + ) + + {false, :repeated_client_error} + else + {true, :client_error_failover} + end + end + + defp should_failover?(%JError{retriable?: false, category: category}, _rest, _ctx) + when category != :client_error, + do: {false, :non_retriable_error} # Smart detection for capability violations (result size, block range, etc.) 
# If we've seen this error N times already, assume it's universal and stop diff --git a/lib/lasso/core/request/request_pipeline/observability.ex b/lib/lasso/core/request/request_pipeline/observability.ex index 7492edca..d939e96b 100644 --- a/lib/lasso/core/request/request_pipeline/observability.ex +++ b/lib/lasso/core/request/request_pipeline/observability.ex @@ -57,6 +57,7 @@ defmodule Lasso.RPC.RequestPipeline.Observability do # Publish routing decision for dashboard/analytics (profile-scoped) publish_routing_decision( request_id: ctx.request_id, + account_id: ctx.account_id, profile: profile, chain: ctx.chain, method: method, @@ -116,6 +117,7 @@ defmodule Lasso.RPC.RequestPipeline.Observability do # Publish routing decision (profile-scoped) publish_routing_decision( request_id: ctx.request_id, + account_id: ctx.account_id, profile: profile, chain: ctx.chain, method: method, @@ -168,14 +170,18 @@ defmodule Lasso.RPC.RequestPipeline.Observability do @doc """ Records a fast-fail event when failing over to next channel. - Emits telemetry for failover events with error categorization. + Emits telemetry for failover events with error categorization, + and records the failure in metrics so per-provider success rates + reflect all attempts (not just final responses). 
""" - @spec record_fast_fail(RequestContext.t(), Channel.t(), atom(), term()) :: :ok + @spec record_fast_fail(RequestContext.t(), Channel.t(), atom(), term(), non_neg_integer()) :: + :ok def record_fast_fail( ctx, %Channel{provider_id: provider_id, transport: transport}, failover_reason, - error_reason + error_reason, + duration_ms ) do profile = ctx.opts.profile @@ -193,6 +199,11 @@ defmodule Lasso.RPC.RequestPipeline.Observability do } ) + # Record failure in metrics so dashboard success rates reflect this attempt + Metrics.record_failure(profile, ctx.chain, provider_id, ctx.method, duration_ms, + transport: transport + ) + # Report failure to ProviderPool (which will update circuit breaker) jerr = JError.from(error_reason, provider_id: provider_id) ProviderPool.report_failure(profile, ctx.chain, provider_id, jerr, transport) @@ -327,6 +338,7 @@ defmodule Lasso.RPC.RequestPipeline.Observability do event = RoutingDecision.new( request_id: opts[:request_id], + account_id: opts[:account_id], profile: opts[:profile], chain: opts[:chain], method: opts[:method], diff --git a/lib/lasso/core/selection/selection.ex b/lib/lasso/core/selection/selection.ex index 59098c17..e6388b84 100644 --- a/lib/lasso/core/selection/selection.ex +++ b/lib/lasso/core/selection/selection.ex @@ -35,7 +35,7 @@ defmodule Lasso.RPC.Selection do Options: - :params => [term()] (RPC params for request analysis, default []) - - :strategy => :fastest | :priority | :round_robin | :latency_weighted (default :round_robin) + - :strategy => :fastest | :priority | :load_balanced | :latency_weighted (default :load_balanced) - :protocol => :http | :ws | :both (default :both) - :exclude => [provider_id] (default []) - :timeout => ms (default 30_000) @@ -47,7 +47,7 @@ defmodule Lasso.RPC.Selection do def select_provider(profile, chain, method, opts \\ []) when is_binary(profile) and is_binary(chain) and is_binary(method) do params = Keyword.get(opts, :params, []) - strategy = Keyword.get(opts, :strategy, 
:round_robin) + strategy = Keyword.get(opts, :strategy, :load_balanced) protocol = Keyword.get(opts, :protocol, :both) exclude = Keyword.get(opts, :exclude, []) timeout = Keyword.get(opts, :timeout, 30_000) @@ -120,7 +120,7 @@ defmodule Lasso.RPC.Selection do health, and performance metrics to return ordered candidate channels. Options: - - :strategy => :fastest | :priority | :round_robin | :latency_weighted + - :strategy => :fastest | :priority | :load_balanced | :latency_weighted - :transport => :http | :ws | :both (default :both) - :exclude => [provider_id] - :limit => integer (maximum channels to return) @@ -131,7 +131,7 @@ defmodule Lasso.RPC.Selection do @spec select_channels(String.t(), String.t(), String.t(), keyword()) :: [Channel.t()] def select_channels(profile, chain, method, opts \\ []) when is_binary(profile) and is_binary(chain) and is_binary(method) do - strategy = Keyword.get(opts, :strategy, :round_robin) + strategy = Keyword.get(opts, :strategy, :load_balanced) transport = Keyword.get(opts, :transport, :both) exclude = Keyword.get(opts, :exclude, []) limit = Keyword.get(opts, :limit, 1000) @@ -243,9 +243,27 @@ defmodule Lasso.RPC.Selection do ordered_channels = strategy_mod.rank_channels(capable_channels, method, prepared_ctx, profile, chain) - # Tiered selection: partition by circuit state to deprioritize half-open channels. - # Closed-circuit channels come first (healthy), half-open channels come last (recovering). - # Within each tier, the strategy's ranking is preserved (maintains randomization for round-robin). + # Health-based tiering: reorder providers by circuit breaker state and rate limit status. + # + # The 4-tier system ensures healthy providers receive traffic first while allowing + # recovering providers to gradually reintegrate: + # + # 1. Tier 1: Closed circuit + not rate-limited (preferred) + # 2. Tier 2: Closed circuit + rate-limited + # 3. Tier 3: Half-open circuit + not rate-limited + # 4. 
Tier 4: Half-open circuit + rate-limited + # + # Open-circuit providers are filtered out earlier in the pipeline. + # + # Within each tier, the strategy's ranking is preserved. For example, with + # load-balanced strategy, Tier 1 providers remain shuffled relative to each other, + # but all Tier 1 providers come before any Tier 2 providers. + # + # This tiering explains why traffic may be concentrated on certain providers even + # with load-balanced: if only one provider is in Tier 1, it receives all traffic + # that succeeds, with lower tiers acting as fallbacks. + + # Step 1: Split by circuit breaker state {closed_channels, half_open_channels} = Enum.split_with(ordered_channels, fn channel -> cb_state = Map.get(circuit_state_map, {channel.provider_id, channel.transport}, :closed) @@ -254,7 +272,7 @@ defmodule Lasso.RPC.Selection do tiered_channels = closed_channels ++ half_open_channels - # Rate-limit tiering: deprioritize rate-limited channels (tried last, not excluded). + # Step 2: Within each circuit tier, split by rate limit status # Final order: closed+not-rl, closed+rl, half-open+not-rl, half-open+rl {not_rate_limited, rate_limited} = Enum.split_with(tiered_channels, fn channel -> diff --git a/lib/lasso/core/selection/strategies/fastest.ex b/lib/lasso/core/selection/strategies/fastest.ex index 602ff983..7774b1d0 100644 --- a/lib/lasso/core/selection/strategies/fastest.ex +++ b/lib/lasso/core/selection/strategies/fastest.ex @@ -1,5 +1,26 @@ defmodule Lasso.RPC.Strategies.Fastest do - @moduledoc "Choose fastest provider using method-specific latency scores with quality filters." + @moduledoc """ + Choose fastest provider using method-specific latency scores with quality filters. + + Ranks providers by measured latency (ascending). Latency is tracked per provider, + per method, per transport, allowing method-specific optimization. + + ## Staleness Handling + + Metrics older than 10 minutes are considered stale and treated as cold start. 
+ This prevents routing decisions based on outdated performance data. + + ## Health Interaction + + Strategy ranking is applied first, then health-based tiering reorders providers + by circuit breaker state and rate limit status. A fast provider with a half-open + circuit will be deprioritized below any closed-circuit provider. + + ## Configuration + + - `FASTEST_MIN_CALLS`: Minimum calls for stable metrics (default: 3) + - `FASTEST_MIN_SUCCESS_RATE`: Minimum success rate filter (default: 0.9) + """ @behaviour Lasso.RPC.Strategy diff --git a/lib/lasso/core/selection/strategies/latency_weighted.ex b/lib/lasso/core/selection/strategies/latency_weighted.ex index 82ff09a5..3ccd98f4 100644 --- a/lib/lasso/core/selection/strategies/latency_weighted.ex +++ b/lib/lasso/core/selection/strategies/latency_weighted.ex @@ -5,11 +5,29 @@ defmodule Lasso.RPC.Strategies.LatencyWeighted do Distributes load across available providers with a probabilistic bias toward lower-latency and higher-success providers for the specific RPC method. - Notes on metrics API inconsistency: - - Transport-specific metrics are recorded under augmented method keys - (e.g., "eth_getLogs@http"). To avoid key mismatch, this strategy first - queries transport-specific metrics for HTTP; if missing, it falls back to - WS and finally to transport-agnostic metrics if available. + ## Weight Formula + + Each provider receives a weight calculated as: + + weight = (1 / latency^beta) * success_rate * confidence * calls_scale + weight = max(weight, explore_floor) + + Higher weights increase selection probability. The formula balances performance + (latency), reliability (success rate), data quality (confidence), and exploration + (explore_floor ensures all providers receive some traffic). + + ## Staleness Handling + + Metrics older than 10 minutes receive only the `explore_floor` weight, preventing + routing decisions based on outdated data while maintaining exploration. 
+ + ## Configuration + + - `LW_BETA`: Latency exponent (default: 3.0, higher = stronger latency preference) + - `LW_MS_FLOOR`: Minimum latency denominator (default: 30ms, prevents division by zero) + - `LW_EXPLORE_FLOOR`: Minimum weight (default: 0.05, ensures exploration) + - `LW_MIN_CALLS`: Minimum calls for stable metrics (default: 3) + - `LW_MIN_SR`: Minimum success rate (default: 0.85) """ @behaviour Lasso.RPC.Strategy diff --git a/lib/lasso/core/selection/strategies/load_balanced.ex b/lib/lasso/core/selection/strategies/load_balanced.ex new file mode 100644 index 00000000..73175af7 --- /dev/null +++ b/lib/lasso/core/selection/strategies/load_balanced.ex @@ -0,0 +1,44 @@ +defmodule Lasso.RPC.Strategies.LoadBalanced do + @moduledoc """ + Load-balanced selection using random shuffle with health-aware tiering. + + Randomly distributes requests across available providers. After shuffling, + the selection pipeline applies tiered reordering based on circuit breaker + state and rate limit status: + + - Tier 1: Closed circuit, not rate-limited (preferred) + - Tier 2: Closed circuit, rate-limited + - Tier 3: Half-open circuit, not rate-limited + - Tier 4: Half-open circuit, rate-limited + + This ensures healthy providers receive the majority of traffic while + recovering providers are gradually reintroduced. + """ + + @behaviour Lasso.RPC.Strategy + + alias Lasso.RPC.ProviderPool + + @impl true + def prepare_context(profile, chain, _method, timeout) do + base_ctx = Lasso.RPC.StrategyContext.new(chain, timeout) + + total_requests = + case ProviderPool.get_status(profile, chain) do + {:ok, %{total_requests: tr}} when is_integer(tr) -> tr + {:ok, status} when is_map(status) -> Map.get(status, :total_requests, 0) + _ -> base_ctx.total_requests || 0 + end + + %{base_ctx | total_requests: total_requests} + end + + @doc """ + Strategy-provided channel ranking: random shuffle per call. 
+ """ + @impl true + def rank_channels(channels, _method, ctx, _profile, _chain) do + _ = ctx + Enum.shuffle(channels) + end +end diff --git a/lib/lasso/core/strategies/strategy_registry.ex b/lib/lasso/core/strategies/strategy_registry.ex index e7f1b1bd..31ea0389 100644 --- a/lib/lasso/core/strategies/strategy_registry.ex +++ b/lib/lasso/core/strategies/strategy_registry.ex @@ -6,13 +6,13 @@ defmodule Lasso.RPC.Strategies.Registry do scattering case statements across the codebase. """ - @type strategy :: :fastest | :round_robin | :latency_weighted + @type strategy :: :fastest | :load_balanced | :round_robin | :latency_weighted @doc """ Resolve a strategy atom to its implementation module. Returns the module implementing the Strategy behavior for the given strategy atom. - Falls back to RoundRobin strategy if unknown strategy is provided. + Falls back to LoadBalanced strategy if unknown strategy is provided. The default registry can be overridden via: @@ -26,13 +26,13 @@ defmodule Lasso.RPC.Strategies.Registry do Lasso.RPC.Strategies.Fastest iex> StrategyRegistry.resolve(:unknown) - Lasso.RPC.Strategies.RoundRobin + Lasso.RPC.Strategies.LoadBalanced """ @spec resolve(strategy) :: module() def resolve(strategy) when is_atom(strategy) do registry = Application.get_env(:lasso, :strategy_registry, default_registry()) - Map.get(registry, strategy, Lasso.RPC.Strategies.RoundRobin) + Map.get(registry, strategy, Lasso.RPC.Strategies.LoadBalanced) end @spec strategy_atoms() :: [atom()] @@ -44,7 +44,8 @@ defmodule Lasso.RPC.Strategies.Registry do @spec default_registry() :: %{strategy => module()} def default_registry do %{ - round_robin: Lasso.RPC.Strategies.RoundRobin, + load_balanced: Lasso.RPC.Strategies.LoadBalanced, + round_robin: Lasso.RPC.Strategies.LoadBalanced, fastest: Lasso.RPC.Strategies.Fastest, latency_weighted: Lasso.RPC.Strategies.LatencyWeighted } diff --git a/lib/lasso/core/support/error_classification.ex b/lib/lasso/core/support/error_classification.ex 
index e6a759ac..a565dc4a 100644 --- a/lib/lasso/core/support/error_classification.ex +++ b/lib/lasso/core/support/error_classification.ex @@ -266,6 +266,7 @@ defmodule Lasso.Core.Support.ErrorClassification do @spec breaker_penalty?(atom()) :: boolean() def breaker_penalty?(:capability_violation), do: false def breaker_penalty?(:rate_limit), do: false + def breaker_penalty?(:client_error), do: false def breaker_penalty?(_category), do: true @doc """ diff --git a/lib/lasso/core/support/error_normalizer.ex b/lib/lasso/core/support/error_normalizer.ex index 8c809c6b..e3017374 100644 --- a/lib/lasso/core/support/error_normalizer.ex +++ b/lib/lasso/core/support/error_normalizer.ex @@ -9,6 +9,8 @@ defmodule Lasso.Core.Support.ErrorNormalizer do All categorization logic is delegated to ErrorClassification for maintainability. """ + require Logger + alias Lasso.Core.Support.ErrorClassifier alias Lasso.JSONRPC.Error, as: JError @@ -124,7 +126,7 @@ defmodule Lasso.Core.Support.ErrorNormalizer do transport = Keyword.get(opts, :transport) # Try to extract nested JSON-RPC error from response body for better classification - {code, message} = extract_nested_error(payload, -32_002, "Server error") + {_, code, message} = extract_nested_error(payload, -32_002, "Server error") # Unified classification with adapter priority %{category: category, retriable?: retriable?, breaker_penalty?: breaker_penalty?} = @@ -156,31 +158,55 @@ defmodule Lasso.Core.Support.ErrorNormalizer do context = Keyword.get(opts, :context, :transport) transport = Keyword.get(opts, :transport) - # Try to extract nested JSON-RPC error from response body for better classification - {code, message} = extract_nested_error(payload, -32_003, "Client error") + case extract_nested_error(payload, -32_003, "Client error") do + {:json_rpc, code, message} -> + # Body is a valid JSON-RPC error envelope — classify normally + %{category: category, retriable?: retriable?, breaker_penalty?: breaker_penalty?} = + 
ErrorClassifier.classify(code, message, provider_id: provider_id) + + data = + if category == :rate_limit do + add_retry_after(payload, payload) + else + payload + end + + JError.new(code, message, + data: data, + provider_id: provider_id, + source: context, + transport: transport, + category: category, + retriable?: retriable?, + breaker_penalty?: breaker_penalty?, + original_code: code + ) - # Unified classification with adapter priority - %{category: category, retriable?: retriable?, breaker_penalty?: breaker_penalty?} = - ErrorClassifier.classify(code, message, provider_id: provider_id) + {:raw, _code, _message} -> + # Body is NOT a JSON-RPC error envelope (e.g. gateway/proxy/CDN rejection). + # A compliant JSON-RPC provider would return errors in JSON-RPC format. + # Non-JSON-RPC 4xx means the RPC handler never processed the request. + status = Map.get(payload, :status, "4xx") + body = Map.get(payload, :body) + body_snippet = if is_binary(body), do: String.slice(body, 0, 200), else: "" - # Extract retry-after hint if this is a rate limit error (e.g., 429) - data = - if category == :rate_limit do - add_retry_after(payload, payload) - else - payload - end + Logger.warning("Reclassifying non-JSON-RPC 4xx as server_error", + provider_id: provider_id, + status: status, + body: body_snippet + ) - JError.new(code, message, - data: data, - provider_id: provider_id, - source: context, - transport: transport, - category: category, - retriable?: retriable?, - breaker_penalty?: breaker_penalty?, - original_code: code - ) + JError.new(-32_002, "Provider infrastructure error (HTTP #{status})", + data: payload, + provider_id: provider_id, + source: context, + transport: transport, + category: :server_error, + retriable?: true, + breaker_penalty?: true, + original_code: -32_003 + ) + end end # Timeout errors @@ -650,25 +676,26 @@ defmodule Lasso.Core.Support.ErrorNormalizer do defp maybe_add_transport(jerr, _transport), do: jerr - # Extract nested JSON-RPC error from HTTP 
error payload (e.g., 4xx/5xx with JSON body) + # Extract nested JSON-RPC error from HTTP error payload (e.g., 4xx/5xx with JSON body). + # + # Returns a tagged tuple: + # {:json_rpc, code, message} — body contained a JSON-RPC error envelope + # {:raw, code, message} — body was not JSON-RPC; code/message are fallbacks defp extract_nested_error(%{body: body} = _payload, fallback_code, fallback_message) when is_binary(body) do case Jason.decode(body) do {:ok, %{"error" => %{"code" => code, "message" => message}}} when is_integer(code) -> - {code, message} + {:json_rpc, code, message} {:ok, %{"error" => %{"message" => message}}} -> - # Error without code - use fallback code but preserve message - {fallback_code, message} + {:json_rpc, fallback_code, message} _ -> - # Not a JSON-RPC error or invalid JSON - use fallback - {fallback_code, fallback_message} + {:raw, fallback_code, fallback_message} end end defp extract_nested_error(_payload, fallback_code, fallback_message) do - # No body or non-map payload - use fallback - {fallback_code, fallback_message} + {:raw, fallback_code, fallback_message} end end diff --git a/lib/lasso_web/components/simulator_controls.ex b/lib/lasso_web/components/simulator_controls.ex index 16a25ff2..0e41efe0 100644 --- a/lib/lasso_web/components/simulator_controls.ex +++ b/lib/lasso_web/components/simulator_controls.ex @@ -23,7 +23,7 @@ defmodule LassoWeb.Dashboard.Components.SimulatorControls do |> assign_new(:sim_collapsed, fn -> true end) |> assign_new(:simulator_running, fn -> false end) |> assign_new(:selected_chains, fn -> [] end) - |> assign_new(:selected_strategy, fn -> "round-robin" end) + |> assign_new(:selected_strategy, fn -> "load-balanced" end) |> assign_new(:request_rate, fn -> 5 end) |> assign_new(:run_duration, fn -> 30 end) |> assign_new(:load_types, fn -> %{http: true, ws: true} end) @@ -32,7 +32,7 @@ defmodule LassoWeb.Dashboard.Components.SimulatorControls do |> assign_new(:active_runs, fn -> [] end) |> 
assign_new(:preview_text, fn -> get_preview_text(%{ - strategy: "round-robin", + strategy: "load-balanced", chains: [], load_types: %{http: true, ws: true} }) @@ -512,7 +512,7 @@ defmodule LassoWeb.Dashboard.Components.SimulatorControls do
<%= for {strategy, label, icon} <- [ - {"round-robin", "Round Robin", "🔄"}, + {"load-balanced", "Load Balanced", "🔄"}, {"fastest", "Fastest", "⚡"}, {"latency-weighted", "Latency Weighted", "⚖️"} ] do %> @@ -670,7 +670,7 @@ defmodule LassoWeb.Dashboard.Components.SimulatorControls do type: "custom", duration: 30_000, profile: profile, - strategy: "round-robin", + strategy: "load-balanced", http: %{ enabled: true, methods: ["eth_blockNumber", "eth_getBalance"], @@ -714,10 +714,10 @@ defmodule LassoWeb.Dashboard.Components.SimulatorControls do defp get_preview_text(%{strategy: strategy, chains: chains}) do strategy_label = case strategy do - "round-robin" -> "Round Robin" + "load-balanced" -> "Load Balanced" "fastest" -> "Fastest" "latency-weighted" -> "Latency Weighted" - _ -> "Round Robin" + _ -> "Load Balanced" end chains_text = diff --git a/lib/lasso_web/controllers/rpc_controller.ex b/lib/lasso_web/controllers/rpc_controller.ex index 7db0ce92..50f2a409 100644 --- a/lib/lasso_web/controllers/rpc_controller.ex +++ b/lib/lasso_web/controllers/rpc_controller.ex @@ -89,8 +89,8 @@ defmodule LassoWeb.RPCController do rpc_with_strategy(conn, params, :fastest) end - @spec rpc_round_robin(Plug.Conn.t(), map()) :: Plug.Conn.t() - def rpc_round_robin(conn, params), do: rpc_with_strategy(conn, params, :round_robin) + @spec rpc_load_balanced(Plug.Conn.t(), map()) :: Plug.Conn.t() + def rpc_load_balanced(conn, params), do: rpc_with_strategy(conn, params, :load_balanced) @spec rpc_latency_weighted(Plug.Conn.t(), map()) :: Plug.Conn.t() def rpc_latency_weighted(conn, params), do: rpc_with_strategy(conn, params, :latency_weighted) @@ -445,7 +445,8 @@ defmodule LassoWeb.RPCController do case conn.assigns[:provider_strategy] do nil -> case params["strategy"] do - "round_robin" -> :round_robin + "load_balanced" -> :load_balanced + "round_robin" -> :load_balanced "fastest" -> :fastest "latency_weighted" -> :latency_weighted _ -> default_provider_strategy() diff --git 
a/lib/lasso_web/dashboard/endpoint_helpers.ex b/lib/lasso_web/dashboard/endpoint_helpers.ex index c88d939d..e61ed52d 100644 --- a/lib/lasso_web/dashboard/endpoint_helpers.ex +++ b/lib/lasso_web/dashboard/endpoint_helpers.ex @@ -10,7 +10,7 @@ defmodule LassoWeb.Dashboard.EndpointHelpers do require Logger # Available routing strategies (must match router.ex and endpoint.ex) - @available_strategies ["round-robin", "latency-weighted", "fastest"] + @available_strategies ["load-balanced", "latency-weighted", "fastest"] @doc """ Returns list of available routing strategies. @@ -113,19 +113,19 @@ defmodule LassoWeb.Dashboard.EndpointHelpers do defp extract_chain_name(_), do: "ethereum" @doc "Get display name for a strategy" - def strategy_display_name("round-robin"), do: "Load Balanced" + def strategy_display_name("load-balanced"), do: "Load Balanced" def strategy_display_name("latency-weighted"), do: "Latency Weighted" def strategy_display_name("fastest"), do: "Fastest" def strategy_display_name(other), do: other |> String.replace("-", " ") |> String.capitalize() # Strategy icons defp strategy_icon("fastest"), do: "⚡" - defp strategy_icon("round-robin"), do: "🔄" + defp strategy_icon("load-balanced"), do: "🔄" defp strategy_icon("latency-weighted"), do: "⚖️" defp strategy_icon(_), do: "🎯" @doc "Get description for a strategy" - def strategy_description("round-robin") do + def strategy_description("load-balanced") do "Distributes requests evenly across all available providers — good for general purpose workloads" end diff --git a/lib/lasso_web/router.ex b/lib/lasso_web/router.ex index 8edf0550..82edc3b4 100644 --- a/lib/lasso_web/router.ex +++ b/lib/lasso_web/router.ex @@ -48,7 +48,8 @@ defmodule LassoWeb.Router do # Legacy endpoints (no profile slug - uses "default" profile) # Strategy-specific endpoints post("/fastest/:chain_id", RPCController, :rpc_fastest) - post("/round-robin/:chain_id", RPCController, :rpc_round_robin) + post("/load-balanced/:chain_id", RPCController, 
:rpc_load_balanced) + post("/round-robin/:chain_id", RPCController, :rpc_load_balanced) post("/latency-weighted/:chain_id", RPCController, :rpc_latency_weighted) # Provider override endpoints @@ -62,7 +63,8 @@ defmodule LassoWeb.Router do scope "/profile/:profile" do # Strategy-specific endpoints post("/fastest/:chain_id", RPCController, :rpc_fastest) - post("/round-robin/:chain_id", RPCController, :rpc_round_robin) + post("/load-balanced/:chain_id", RPCController, :rpc_load_balanced) + post("/round-robin/:chain_id", RPCController, :rpc_load_balanced) post("/latency-weighted/:chain_id", RPCController, :rpc_latency_weighted) # Provider override endpoints diff --git a/lib/lasso_web/rpc/helpers.ex b/lib/lasso_web/rpc/helpers.ex index e3ed3f10..403c49f7 100644 --- a/lib/lasso_web/rpc/helpers.ex +++ b/lib/lasso_web/rpc/helpers.ex @@ -12,17 +12,17 @@ defmodule LassoWeb.RPC.Helpers do @doc """ Returns the configured default provider selection strategy. - This reads from application configuration with a fallback to `:round_robin`. + This reads from application configuration with a fallback to `:load_balanced`. 
## Examples iex> LassoWeb.RPC.Helpers.default_provider_strategy() - :round_robin + :load_balanced """ @spec default_provider_strategy() :: atom() def default_provider_strategy do - Application.get_env(:lasso, :provider_selection_strategy, :round_robin) + Application.get_env(:lasso, :provider_selection_strategy, :load_balanced) end @doc """ diff --git a/scripts/rpc_load_test.mjs b/scripts/rpc_load_test.mjs index 20ae4077..1aab0d4d 100644 --- a/scripts/rpc_load_test.mjs +++ b/scripts/rpc_load_test.mjs @@ -18,11 +18,12 @@ function parseArgs(argv) { const args = argv.slice(2); const opts = { url: - process.env.RPC_URL || "http://localhost:4000/rpc/round-robin/ethereum", + process.env.RPC_URL || "http://localhost:4000/rpc/load-balanced/ethereum", host: process.env.HOST || null, // base host URL chains: process.env.CHAINS || null, // comma-separated chain names - strategy: process.env.STRATEGY || "round-robin", // routing strategy + strategy: process.env.STRATEGY || "load-balanced", // routing strategy profile: process.env.PROFILE || null, // profile name (e.g., "default", "testnet") + apiKey: process.env.API_KEY || null, // API key for authentication concurrency: Number(process.env.CONCURRENCY) || 16, duration: Number(process.env.DURATION) || 30, // seconds rampUpDuration: Number(process.env.RAMP_UP_DURATION) || null, // seconds (defaults to 10s or 20% of duration, whichever is smaller) @@ -60,6 +61,11 @@ function parseArgs(argv) { opts.profile = String(next()); i++; break; + case "--api-key": + case "-k": + opts.apiKey = String(next()); + i++; + break; case "--concurrency": case "-c": opts.concurrency = Number(next()); @@ -115,16 +121,18 @@ function printHelpAndExit() { console.log( `Usage: node scripts/rpc_load_test.mjs [options]\n\n` + `Options:\n` + - ` -u, --url RPC endpoint (default: http://localhost:4000/rpc/round-robin/ethereum)\n` + + ` -u, --url RPC endpoint (default: http://localhost:4000/rpc/load-balanced/ethereum)\n` + ` (ignored if --chains is used)\n` + ` 
--host Base host URL (default: http://localhost:4000)\n` + ` (used with --chains)\n` + ` --chains Comma-separated chain names (e.g., "ethereum,base")\n` + ` If specified, tests all chains in parallel\n` + - ` -s, --strategy Routing strategy: round-robin, fastest, latency-weighted\n` + - ` (default: round-robin, used with --chains)\n` + + ` -s, --strategy Routing strategy: load-balanced, fastest, latency-weighted\n` + + ` (default: load-balanced, used with --chains)\n` + ` -p, --profile Profile name (e.g., "default", "testnet")\n` + ` If not specified, uses legacy routes (default profile)\n` + + ` -k, --api-key API key for authentication (lasso_...)\n` + + ` Can also be set via API_KEY environment variable\n` + ` -c, --concurrency Concurrent workers per chain (default: 16)\n` + ` -d, --duration Test duration seconds (default: 30)\n` + ` --ramp-up Ramp-up duration seconds (default: 10s or 20% of duration, whichever is smaller)\n` + @@ -209,13 +217,17 @@ function data_balanceOf(address) { // ----------------------------- // Bootstrap dynamic context // ----------------------------- -async function rpc(url, body, timeoutMs) { +async function rpc(url, body, timeoutMs, apiKey = null) { const controller = new AbortController(); const timer = setTimeout(() => controller.abort(), timeoutMs); try { + const headers = { "content-type": "application/json" }; + if (apiKey) { + headers["x-lasso-api-key"] = apiKey; + } const res = await fetch(url, { method: "POST", - headers: { "content-type": "application/json" }, + headers, body: JSON.stringify(body), signal: controller.signal, }); @@ -248,14 +260,14 @@ async function rpc(url, body, timeoutMs) { } } -async function getLatestBlockNumber(url, timeoutMs) { +async function getLatestBlockNumber(url, timeoutMs, apiKey = null) { const body = { jsonrpc: "2.0", id: 1, method: "eth_blockNumber", params: [] }; - const { ok, json } = await rpc(url, body, timeoutMs); + const { ok, json } = await rpc(url, body, timeoutMs, apiKey); if (!ok || 
!json || !json.result) return null; return Number(BigInt(json.result)); } -async function getBlockByNumber(url, num, fullTx = false, timeoutMs = 10000) { +async function getBlockByNumber(url, num, fullTx = false, timeoutMs = 10000, apiKey = null) { const param = typeof num === "string" ? num : toHex(num); const body = { jsonrpc: "2.0", @@ -263,7 +275,7 @@ async function getBlockByNumber(url, num, fullTx = false, timeoutMs = 10000) { method: "eth_getBlockByNumber", params: [param, Boolean(fullTx)], }; - const { ok, json } = await rpc(url, body, timeoutMs); + const { ok, json } = await rpc(url, body, timeoutMs, apiKey); if (!ok || !json) return null; return json.result || null; } @@ -272,11 +284,12 @@ async function findRecentTxHash( url, startNumber, searchBack = 20, - timeoutMs = 10000 + timeoutMs = 10000, + apiKey = null ) { let n = startNumber; for (let i = 0; i < searchBack && n >= 0; i++, n--) { - const block = await getBlockByNumber(url, n, false, timeoutMs); + const block = await getBlockByNumber(url, n, false, timeoutMs, apiKey); if ( block && Array.isArray(block.transactions) && @@ -286,16 +299,16 @@ async function findRecentTxHash( } } // Fallback: latest block (hash) even if no tx, txHash undefined - const latest = await getBlockByNumber(url, "latest", false, timeoutMs); + const latest = await getBlockByNumber(url, "latest", false, timeoutMs, apiKey); return { txHash: undefined, blockHash: latest && latest.hash }; } -async function bootstrapContext(url, timeoutMs) { - const latestNum = await getLatestBlockNumber(url, timeoutMs); +async function bootstrapContext(url, timeoutMs, apiKey = null) { + const latestNum = await getLatestBlockNumber(url, timeoutMs, apiKey); if (latestNum == null) { return { latestNumber: 0, txHash: undefined, blockHash: undefined }; } - const found = await findRecentTxHash(url, latestNum, 30, timeoutMs); + const found = await findRecentTxHash(url, latestNum, 30, timeoutMs, apiKey); return { latestNumber: latestNum, txHash: 
found.txHash, @@ -469,9 +482,9 @@ function parseChains(chainsCsv) { // ----------------------------- // Load generation (single chain) // ----------------------------- -async function runSingleChain(url, chainName, rpsLimit) { +async function runSingleChain(url, chainName, rpsLimit, apiKey = null) { const prefix = chainName ? `[${chainName}] ` : ""; - const ctx = await bootstrapContext(url, options.timeout); + const ctx = await bootstrapContext(url, options.timeout, apiKey); const ctxRef = { latestNumber: ctx.latestNumber, txHash: ctx.txHash, @@ -595,7 +608,7 @@ async function runSingleChain(url, chainName, rpsLimit) { params: methodDef.params(), }; const t0 = nowNs(); - const res = await rpc(url, body, options.timeout); + const res = await rpc(url, body, options.timeout, apiKey); const dtMs = nsToMs(nowNs() - t0); windowReq++; @@ -772,7 +785,7 @@ async function run() { const url = buildChainUrl(host, options.strategy, chain, options.profile); const rpsLimit = chainRates.get(chain) || options.rpsLimit || null; try { - return await runSingleChain(url, chain, rpsLimit); + return await runSingleChain(url, chain, rpsLimit, options.apiKey); } catch (err) { console.error(`[${chain}] Fatal error:`, err.message); return { @@ -872,7 +885,7 @@ async function run() { const afterRpc = pathParts.slice(rpcIndex + 1); // Determine if there's a strategy (check if first part is a valid strategy) - const validStrategies = ["round-robin", "fastest", "latency-weighted"]; + const validStrategies = ["load-balanced", "fastest", "latency-weighted"]; let strategy = null; let chain = null; @@ -897,7 +910,8 @@ async function run() { const result = await runSingleChain( testUrl, chainName, - options.rpsLimit + options.rpsLimit, + options.apiKey ); console.log("\n=== Summary ==="); diff --git a/test/integration/http_inclusion_with_ws_disconnect_integration_test.exs b/test/integration/http_inclusion_with_ws_disconnect_integration_test.exs index 1d67a46e..90c613a1 100644 --- 
a/test/integration/http_inclusion_with_ws_disconnect_integration_test.exs +++ b/test/integration/http_inclusion_with_ws_disconnect_integration_test.exs @@ -28,7 +28,7 @@ defmodule Lasso.RPC.HttpInclusionWithWsDisconnectIntegrationTest do chain, "eth_blockNumber", [], - %RequestOptions{transport: :http, strategy: :round_robin, timeout_ms: 30_000} + %RequestOptions{transport: :http, strategy: :load_balanced, timeout_ms: 30_000} ) # HTTP candidates should include the provider diff --git a/test/integration/passthrough_integration_test.exs b/test/integration/passthrough_integration_test.exs index b467e5c2..d3d5b665 100644 --- a/test/integration/passthrough_integration_test.exs +++ b/test/integration/passthrough_integration_test.exs @@ -33,7 +33,7 @@ defmodule Lasso.RPC.PassthroughIntegrationTest do chain, "eth_blockNumber", [], - %RequestOptions{strategy: :round_robin, timeout_ms: 30_000} + %RequestOptions{strategy: :load_balanced, timeout_ms: 30_000} ) # Result should be a Response.Success struct with passthrough data @@ -63,7 +63,7 @@ defmodule Lasso.RPC.PassthroughIntegrationTest do chain, "eth_blockNumber", [], - %RequestOptions{strategy: :round_robin, timeout_ms: 30_000} + %RequestOptions{strategy: :load_balanced, timeout_ms: 30_000} ) assert %Response.Success{} = result @@ -90,7 +90,7 @@ defmodule Lasso.RPC.PassthroughIntegrationTest do chain, "eth_blockNumber", [], - %RequestOptions{strategy: :round_robin, timeout_ms: 30_000} + %RequestOptions{strategy: :load_balanced, timeout_ms: 30_000} ) assert %Response.Success{} = result @@ -114,7 +114,7 @@ defmodule Lasso.RPC.PassthroughIntegrationTest do chain, "eth_chainId", [], - %RequestOptions{strategy: :round_robin, timeout_ms: 30_000} + %RequestOptions{strategy: :load_balanced, timeout_ms: 30_000} ) # eth_chainId is handled locally for some chains, but if forwarded, @@ -146,7 +146,7 @@ defmodule Lasso.RPC.PassthroughIntegrationTest do chain, "eth_getBalance", [address, "latest"], - %RequestOptions{strategy: 
:round_robin, timeout_ms: 30_000} + %RequestOptions{strategy: :load_balanced, timeout_ms: 30_000} ) assert %Response.Success{} = result @@ -173,7 +173,7 @@ defmodule Lasso.RPC.PassthroughIntegrationTest do chain, "eth_blockNumber", [], - %RequestOptions{strategy: :round_robin, timeout_ms: 30_000} + %RequestOptions{strategy: :load_balanced, timeout_ms: 30_000} ) # Even after failover, should get Response.Success with passthrough @@ -206,7 +206,7 @@ defmodule Lasso.RPC.PassthroughIntegrationTest do %RequestOptions{ provider_override: "error_provider", failover_on_override: false, - strategy: :round_robin, + strategy: :load_balanced, timeout_ms: 30_000 } ) @@ -229,7 +229,7 @@ defmodule Lasso.RPC.PassthroughIntegrationTest do chain, "eth_blockNumber", [], - %RequestOptions{strategy: :round_robin, timeout_ms: 30_000} + %RequestOptions{strategy: :load_balanced, timeout_ms: 30_000} ) assert %Response.Success{id: response_id} = result @@ -256,7 +256,7 @@ defmodule Lasso.RPC.PassthroughIntegrationTest do chain, "eth_blockNumber", [], - %RequestOptions{strategy: :round_robin, timeout_ms: 30_000} + %RequestOptions{strategy: :load_balanced, timeout_ms: 30_000} ) assert %Response.Success{} = result @@ -279,7 +279,7 @@ defmodule Lasso.RPC.PassthroughIntegrationTest do chain, "eth_blockNumber", [], - %RequestOptions{strategy: :round_robin, timeout_ms: 30_000} + %RequestOptions{strategy: :load_balanced, timeout_ms: 30_000} ) # Upstream latency should be recorded @@ -308,7 +308,7 @@ defmodule Lasso.RPC.PassthroughIntegrationTest do chain, "eth_blockNumber", [], - %RequestOptions{strategy: :round_robin, timeout_ms: 30_000} + %RequestOptions{strategy: :load_balanced, timeout_ms: 30_000} ) assert %Response.Success{raw_bytes: original_bytes} = result diff --git a/test/integration/request_pipeline_integration_test.exs b/test/integration/request_pipeline_integration_test.exs index 3a761093..7409b3f2 100644 --- a/test/integration/request_pipeline_integration_test.exs +++ 
b/test/integration/request_pipeline_integration_test.exs @@ -36,7 +36,7 @@ defmodule Lasso.RPC.RequestPipelineIntegrationTest do chain, "eth_blockNumber", [], - %RequestOptions{strategy: :round_robin, timeout_ms: 30_000} + %RequestOptions{strategy: :load_balanced, timeout_ms: 30_000} ) # Verify request succeeded (using backup) @@ -63,7 +63,7 @@ defmodule Lasso.RPC.RequestPipelineIntegrationTest do "eth_blockNumber", [], %RequestOptions{ - strategy: :round_robin, + strategy: :load_balanced, timeout_ms: 30_000, provider_override: "failing_provider" } @@ -92,7 +92,7 @@ defmodule Lasso.RPC.RequestPipelineIntegrationTest do provider_override: "failing_provider", failover_on_override: false, timeout_ms: 30_000, - strategy: :round_robin + strategy: :load_balanced } ) @@ -137,7 +137,7 @@ defmodule Lasso.RPC.RequestPipelineIntegrationTest do provider_override: "flaky", failover_on_override: false, timeout_ms: 30_000, - strategy: :round_robin + strategy: :load_balanced } ) @@ -169,7 +169,7 @@ defmodule Lasso.RPC.RequestPipelineIntegrationTest do provider_override: "flaky", failover_on_override: false, timeout_ms: 30_000, - strategy: :round_robin + strategy: :load_balanced } ) @@ -237,7 +237,7 @@ defmodule Lasso.RPC.RequestPipelineIntegrationTest do chain, "eth_blockNumber", [], - %RequestOptions{strategy: :round_robin, timeout_ms: 30_000} + %RequestOptions{strategy: :load_balanced, timeout_ms: 30_000} ) # Request should succeed via backup - now returns Response.Success @@ -269,7 +269,7 @@ defmodule Lasso.RPC.RequestPipelineIntegrationTest do provider_override: "backup", failover_on_override: false, timeout_ms: 30_000, - strategy: :round_robin + strategy: :load_balanced } ) @@ -296,7 +296,7 @@ defmodule Lasso.RPC.RequestPipelineIntegrationTest do provider_override: "preferred", failover_on_override: true, timeout_ms: 30_000, - strategy: :round_robin + strategy: :load_balanced } ) @@ -324,7 +324,7 @@ defmodule Lasso.RPC.RequestPipelineIntegrationTest do chain, 
"eth_blockNumber", [], - %RequestOptions{strategy: :round_robin, timeout_ms: 30_000} + %RequestOptions{strategy: :load_balanced, timeout_ms: 30_000} ) # In a real scenario with provider-specific adapter logic: @@ -346,7 +346,7 @@ defmodule Lasso.RPC.RequestPipelineIntegrationTest do chain, "eth_blockNumber", [], - %RequestOptions{strategy: :round_robin, timeout_ms: 30_000} + %RequestOptions{strategy: :load_balanced, timeout_ms: 30_000} ) # Execute with nil params @@ -355,7 +355,7 @@ defmodule Lasso.RPC.RequestPipelineIntegrationTest do chain, "eth_blockNumber", nil, - %RequestOptions{strategy: :round_robin, timeout_ms: 30_000} + %RequestOptions{strategy: :load_balanced, timeout_ms: 30_000} ) # Both should succeed @@ -386,7 +386,7 @@ defmodule Lasso.RPC.RequestPipelineIntegrationTest do provider_override: "provider", failover_on_override: false, timeout_ms: 30_000, - strategy: :round_robin + strategy: :load_balanced } ) @@ -415,7 +415,7 @@ defmodule Lasso.RPC.RequestPipelineIntegrationTest do provider_override: "slow", failover_on_override: false, timeout_ms: 100, - strategy: :round_robin + strategy: :load_balanced } ) @@ -438,7 +438,7 @@ defmodule Lasso.RPC.RequestPipelineIntegrationTest do provider_override: "nonexistent", failover_on_override: false, timeout_ms: 30_000, - strategy: :round_robin + strategy: :load_balanced } ) @@ -462,7 +462,7 @@ defmodule Lasso.RPC.RequestPipelineIntegrationTest do chain, "eth_blockNumber", [], - %RequestOptions{transport: :http, timeout_ms: 30_000, strategy: :round_robin} + %RequestOptions{transport: :http, timeout_ms: 30_000, strategy: :load_balanced} ) # Execute with WS transport override (if supported) @@ -502,7 +502,7 @@ defmodule Lasso.RPC.RequestPipelineIntegrationTest do chain, "eth_blockNumber", [], - %RequestOptions{strategy: :round_robin, timeout_ms: 30_000} + %RequestOptions{strategy: :load_balanced, timeout_ms: 30_000} ) # Wait for start event @@ -547,7 +547,7 @@ defmodule Lasso.RPC.RequestPipelineIntegrationTest do 
chain, "eth_blockNumber", [], - %RequestOptions{strategy: :round_robin, timeout_ms: 30_000} + %RequestOptions{strategy: :load_balanced, timeout_ms: 30_000} ) # Verify telemetry shows success @@ -582,7 +582,7 @@ defmodule Lasso.RPC.RequestPipelineIntegrationTest do provider_override: "failing", failover_on_override: false, timeout_ms: 30_000, - strategy: :round_robin + strategy: :load_balanced } ) @@ -618,7 +618,7 @@ defmodule Lasso.RPC.RequestPipelineIntegrationTest do chain, "eth_blockNumber", [], - %RequestOptions{strategy: :round_robin, timeout_ms: 30_000} + %RequestOptions{strategy: :load_balanced, timeout_ms: 30_000} ) # Should have emitted at least one start event diff --git a/test/integration/transport_failure_reporting_integration_test.exs b/test/integration/transport_failure_reporting_integration_test.exs index 33d58df2..922d786d 100644 --- a/test/integration/transport_failure_reporting_integration_test.exs +++ b/test/integration/transport_failure_reporting_integration_test.exs @@ -36,7 +36,7 @@ defmodule Lasso.RPC.TransportFailureReportingIntegrationTest do %RequestOptions{ provider_override: "fail_http", failover_on_override: false, - strategy: :round_robin, + strategy: :load_balanced, timeout_ms: 30_000 } ) diff --git a/test/lasso/core/health_probe/probe_classification_test.exs b/test/lasso/core/health_probe/probe_classification_test.exs new file mode 100644 index 00000000..d02697a3 --- /dev/null +++ b/test/lasso/core/health_probe/probe_classification_test.exs @@ -0,0 +1,88 @@ +defmodule Lasso.HealthProbe.ProbeClassificationTest do + @moduledoc """ + Tests for health probe result classification logic. + + The probe classification determines whether a provider is alive or dead based on + the response to `eth_chainId`. The key invariant: only rate-limit responses should + be treated as "alive" (the provider is reachable but busy). All other error responses + mean the provider is broken and should increment consecutive_failures. 
+ + These tests validate the classification patterns inline since the logic is embedded + in BatchCoordinator's private do_http_probe/do_ws_probe functions. We test the same + branching conditions against the same data shapes those functions match on. + """ + + use ExUnit.Case, async: true + + alias Lasso.JSONRPC.Error, as: JError + alias Lasso.RPC.Response + + # Replicate the HTTP probe classification logic from BatchCoordinator.do_http_probe + defp classify_http_probe_error(%Response.Error{error: jerror}) do + if jerror.category == :rate_limit do + {:ok, :rate_limited} + else + {:error, jerror} + end + end + + # Replicate the WS probe classification logic from BatchCoordinator.do_ws_probe + defp classify_ws_probe_error(error) do + case error do + %{category: :rate_limit} -> + {:ok, :rate_limited} + + reason -> + {:error, reason} + end + end + + describe "HTTP probe classification" do + test "success response returns chain ID" do + # eth_chainId success is handled before classification — just verify the shape + chain_id = String.to_integer("1", 16) + assert {:ok, ^chain_id} = {:ok, chain_id} + end + + test "rate_limit error response treated as alive" do + jerror = JError.new(-32_005, "Rate limit exceeded", category: :rate_limit, retriable?: true) + response = %Response.Error{id: 1, jsonrpc: "2.0", error: jerror} + + assert {:ok, :rate_limited} = classify_http_probe_error(response) + end + + test "client_error response treated as failure" do + jerror = JError.new(-32_000, "Bad Request", category: :client_error, retriable?: false) + response = %Response.Error{id: 1, jsonrpc: "2.0", error: jerror} + + assert {:error, ^jerror} = classify_http_probe_error(response) + end + + test "server_error response treated as failure" do + jerror = JError.new(-32_000, "Internal error", category: :server_error, retriable?: true) + response = %Response.Error{id: 1, jsonrpc: "2.0", error: jerror} + + assert {:error, ^jerror} = classify_http_probe_error(response) + end + end + + 
describe "WS probe classification" do + test "rate_limit error treated as alive" do + jerror = JError.new(-32_005, "Rate limit exceeded", category: :rate_limit, retriable?: true) + + assert {:ok, :rate_limited} = classify_ws_probe_error(jerror) + end + + test "non-rate-limit error treated as failure" do + jerror = JError.new(-32_000, "Server error", category: :server_error, retriable?: true) + + assert {:error, ^jerror} = classify_ws_probe_error(jerror) + end + + test "transport error treated as failure" do + reason = {:ws_not_connected, "ethereum", "test-provider"} + + assert {:error, ^reason} = classify_ws_probe_error(reason) + end + end +end diff --git a/test/lasso/core/request/request_pipeline/failover_strategy_test.exs b/test/lasso/core/request/request_pipeline/failover_strategy_test.exs new file mode 100644 index 00000000..90446ba3 --- /dev/null +++ b/test/lasso/core/request/request_pipeline/failover_strategy_test.exs @@ -0,0 +1,130 @@ +defmodule Lasso.RPC.RequestPipeline.FailoverStrategyTest do + use ExUnit.Case, async: true + + alias Lasso.JSONRPC.Error, as: JError + alias Lasso.RPC.{Channel, RequestContext} + alias Lasso.RPC.RequestPipeline.FailoverStrategy + + defp make_channel(provider_id \\ "test-provider") do + %Channel{ + profile: "default", + chain: "ethereum", + provider_id: provider_id, + transport: :http, + raw_channel: nil, + transport_module: nil, + capabilities: nil + } + end + + defp make_ctx(opts \\ []) do + ctx = RequestContext.new("ethereum", "eth_call", []) + error_categories = Keyword.get(opts, :error_categories, %{}) + %{ctx | repeated_error_categories: error_categories} + end + + describe "client_error failover" do + test "failovers once when channels remain and no prior client errors" do + error = JError.new(-32_000, "Bad Request", category: :client_error, retriable?: false) + channels = [make_channel("provider-b")] + ctx = make_ctx() + + assert {:failover, :client_error_failover} = FailoverStrategy.decide(error, channels, ctx) + end + 
+ test "treats as terminal after threshold reached" do + error = JError.new(-32_000, "Bad Request", category: :client_error, retriable?: false) + channels = [make_channel("provider-b")] + ctx = make_ctx(error_categories: %{client_error: 1}) + + assert {:terminal_error, :repeated_client_error} = + FailoverStrategy.decide(error, channels, ctx) + end + + test "no channels remaining wins over client_error failover" do + error = JError.new(-32_000, "Bad Request", category: :client_error, retriable?: false) + ctx = make_ctx() + + assert {:terminal_error, :no_channels_remaining} = FailoverStrategy.decide(error, [], ctx) + end + end + + describe "non-retriable errors" do + test "invalid_params is terminal" do + error = JError.new(-32_602, "Invalid params", category: :invalid_params, retriable?: false) + channels = [make_channel()] + ctx = make_ctx() + + assert {:terminal_error, :non_retriable_error} = + FailoverStrategy.decide(error, channels, ctx) + end + + test "parse_error is terminal" do + error = JError.new(-32_700, "Parse error", category: :parse_error, retriable?: false) + channels = [make_channel()] + ctx = make_ctx() + + assert {:terminal_error, :non_retriable_error} = + FailoverStrategy.decide(error, channels, ctx) + end + end + + describe "capability_violation tracking is independent of client_error" do + test "capability violation with client_error count does not affect its own threshold" do + error = + JError.new(-32_701, "Max results exceeded", + category: :capability_violation, + retriable?: true + ) + + channels = [make_channel()] + ctx = make_ctx(error_categories: %{client_error: 5}) + + assert {:failover, :capability_violation_detected} = + FailoverStrategy.decide(error, channels, ctx) + end + end + + describe "retriable categories still failover (regression)" do + test "rate_limit → failover" do + error = JError.new(-32_005, "Rate limited", category: :rate_limit, retriable?: true) + channels = [make_channel()] + ctx = make_ctx() + + assert {:failover, 
:rate_limit_detected} = FailoverStrategy.decide(error, channels, ctx) + end + + test "server_error → failover" do + error = JError.new(-32_000, "Internal error", category: :server_error, retriable?: true) + channels = [make_channel()] + ctx = make_ctx() + + assert {:failover, :server_error_detected} = FailoverStrategy.decide(error, channels, ctx) + end + + test "network_error → failover" do + error = + JError.new(-32_004, "Connection failed", category: :network_error, retriable?: true) + + channels = [make_channel()] + ctx = make_ctx() + + assert {:failover, :network_error_detected} = FailoverStrategy.decide(error, channels, ctx) + end + + test "timeout → failover" do + error = JError.new(-32_000, "Request timeout", category: :timeout, retriable?: true) + channels = [make_channel()] + ctx = make_ctx() + + assert {:failover, :timeout_detected} = FailoverStrategy.decide(error, channels, ctx) + end + + test "circuit_open → failover" do + channels = [make_channel()] + ctx = make_ctx() + + assert {:failover, :circuit_open} = FailoverStrategy.decide(:circuit_open, channels, ctx) + end + end +end diff --git a/test/lasso/core/request/request_pipeline_test.exs b/test/lasso/core/request/request_pipeline_test.exs index e74c6a58..d48e6bb3 100644 --- a/test/lasso/core/request/request_pipeline_test.exs +++ b/test/lasso/core/request/request_pipeline_test.exs @@ -33,7 +33,7 @@ defmodule Lasso.RPC.RequestPipelineTest do result = RequestPipeline.execute_via_channels("ethereum", "eth_blockNumber", [], %RequestOptions{ profile: "default", - strategy: :round_robin, + strategy: :load_balanced, timeout_ms: 30_000 }) @@ -63,7 +63,7 @@ defmodule Lasso.RPC.RequestPipelineTest do result = RequestPipeline.execute_via_channels("ethereum", "eth_blockNumber", [], %RequestOptions{ profile: "default", - strategy: :round_robin, + strategy: :load_balanced, timeout_ms: 30_000 }) @@ -88,7 +88,7 @@ defmodule Lasso.RPC.RequestPipelineTest do result = RequestPipeline.execute_via_channels("ethereum", 
"eth_blockNumber", [], %RequestOptions{ profile: "default", - strategy: :round_robin, + strategy: :load_balanced, timeout_ms: 30_000 }) @@ -100,7 +100,7 @@ defmodule Lasso.RPC.RequestPipelineTest do result = RequestPipeline.execute_via_channels("ethereum", "eth_blockNumber", [], %RequestOptions{ profile: "default", - strategy: :round_robin, + strategy: :load_balanced, timeout_ms: 30_000 }) @@ -115,7 +115,7 @@ defmodule Lasso.RPC.RequestPipelineTest do "ethereum", "eth_getBalance", ["0x123", "latest"], - %RequestOptions{profile: "default", strategy: :round_robin, timeout_ms: 30_000} + %RequestOptions{profile: "default", strategy: :load_balanced, timeout_ms: 30_000} ) ctx = extract_context(result) @@ -148,11 +148,11 @@ defmodule Lasso.RPC.RequestPipelineTest do assert_result_valid(result) end - test ":round_robin strategy" do + test ":load_balanced strategy" do result = RequestPipeline.execute_via_channels("ethereum", "eth_blockNumber", [], %RequestOptions{ profile: "default", - strategy: :round_robin, + strategy: :load_balanced, timeout_ms: 30_000 }) @@ -179,7 +179,7 @@ defmodule Lasso.RPC.RequestPipelineTest do profile: "default", transport: :http, timeout_ms: 30_000, - strategy: :round_robin + strategy: :load_balanced }) assert_result_valid(result) @@ -191,7 +191,7 @@ defmodule Lasso.RPC.RequestPipelineTest do profile: "default", transport: :ws, timeout_ms: 30_000, - strategy: :round_robin + strategy: :load_balanced }) assert_result_valid(result) @@ -201,7 +201,7 @@ defmodule Lasso.RPC.RequestPipelineTest do result = RequestPipeline.execute_via_channels("ethereum", "eth_blockNumber", [], %RequestOptions{ profile: "default", - strategy: :round_robin, + strategy: :load_balanced, timeout_ms: 30_000 }) @@ -216,7 +216,7 @@ defmodule Lasso.RPC.RequestPipelineTest do RequestPipeline.execute_via_channels("ethereum", "eth_blockNumber", [], %RequestOptions{ profile: "default", timeout_ms: 5000, - strategy: :round_robin + strategy: :load_balanced }) 
assert_result_valid(result) @@ -226,7 +226,7 @@ defmodule Lasso.RPC.RequestPipelineTest do result = RequestPipeline.execute_via_channels("ethereum", "eth_blockNumber", [], %RequestOptions{ profile: "default", - strategy: :round_robin, + strategy: :load_balanced, timeout_ms: 30_000 }) @@ -243,7 +243,7 @@ defmodule Lasso.RPC.RequestPipelineTest do "ethereum", "eth_getBalance", params, - %RequestOptions{profile: "default", strategy: :round_robin, timeout_ms: 30_000} + %RequestOptions{profile: "default", strategy: :load_balanced, timeout_ms: 30_000} ) ctx = extract_context(result) @@ -254,7 +254,7 @@ defmodule Lasso.RPC.RequestPipelineTest do result = RequestPipeline.execute_via_channels("ethereum", "eth_blockNumber", nil, %RequestOptions{ profile: "default", - strategy: :round_robin, + strategy: :load_balanced, timeout_ms: 30_000 }) @@ -268,7 +268,7 @@ defmodule Lasso.RPC.RequestPipelineTest do result = RequestPipeline.execute_via_channels("ethereum", "eth_test", [], %RequestOptions{ profile: "default", - strategy: :round_robin, + strategy: :load_balanced, timeout_ms: 30_000 }) @@ -280,7 +280,7 @@ defmodule Lasso.RPC.RequestPipelineTest do result = RequestPipeline.execute_via_channels("ethereum", "eth_blockNumber", [], %RequestOptions{ profile: "default", - strategy: :round_robin, + strategy: :load_balanced, timeout_ms: 30_000 }) @@ -293,7 +293,7 @@ defmodule Lasso.RPC.RequestPipelineTest do result = RequestPipeline.execute_via_channels("ethereum", "eth_test", [], %RequestOptions{ profile: "default", - strategy: :round_robin, + strategy: :load_balanced, timeout_ms: 30_000 }) @@ -378,7 +378,7 @@ defmodule Lasso.RPC.RequestPipelineTest do result = RequestPipeline.execute_via_channels("ethereum", "eth_blockNumber", [], %RequestOptions{ profile: "default", - strategy: :round_robin, + strategy: :load_balanced, timeout_ms: 30_000 }) diff --git a/test/lasso/core/support/error_normalizer_test.exs b/test/lasso/core/support/error_normalizer_test.exs new file mode 100644 index 
00000000..63897de3 --- /dev/null +++ b/test/lasso/core/support/error_normalizer_test.exs @@ -0,0 +1,150 @@ +defmodule Lasso.Core.Support.ErrorNormalizerTest do + use ExUnit.Case, async: true + + alias Lasso.Core.Support.ErrorNormalizer + alias Lasso.JSONRPC.Error, as: JError + + describe "client_error with JSON-RPC body" do + test "classifies normally when body is a valid JSON-RPC error" do + payload = %{ + status: 400, + body: ~s({"jsonrpc":"2.0","error":{"code":-32602,"message":"Invalid params"},"id":1}) + } + + jerr = ErrorNormalizer.normalize({:client_error, payload}, provider_id: "test") + + assert %JError{} = jerr + assert jerr.category == :invalid_params + assert jerr.retriable? == false + assert jerr.code == -32_602 + end + + test "classifies normally when body has error with message but no code" do + payload = %{ + status: 400, + body: ~s({"error":{"message":"Missing required field"}}) + } + + jerr = ErrorNormalizer.normalize({:client_error, payload}, provider_id: "test") + + assert %JError{} = jerr + assert jerr.message == "Missing required field" + end + + test "detects rate limit in JSON-RPC error body" do + payload = %{ + status: 429, + body: ~s({"error":{"code":-32005,"message":"Rate limit exceeded"}}) + } + + jerr = ErrorNormalizer.normalize({:client_error, payload}, provider_id: "test") + + assert jerr.category == :rate_limit + assert jerr.retriable? == true + end + end + + describe "client_error with non-JSON-RPC body (reclassification)" do + test "reclassifies dRPC-style gateway rejection as server_error" do + payload = %{ + status: 400, + body: ~s({"message":"Invalid request"}\n) + } + + jerr = ErrorNormalizer.normalize({:client_error, payload}, provider_id: "base_drpc") + + assert %JError{} = jerr + assert jerr.category == :server_error + assert jerr.retriable? == true + assert jerr.breaker_penalty? 
== true + assert jerr.code == -32_002 + assert jerr.message =~ "HTTP 400" + end + + test "reclassifies HTML error page as server_error" do + payload = %{ + status: 403, + body: "

<html><body>403 Forbidden</body></html>

" + } + + jerr = ErrorNormalizer.normalize({:client_error, payload}, provider_id: "test") + + assert jerr.category == :server_error + assert jerr.retriable? == true + assert jerr.breaker_penalty? == true + end + + test "reclassifies empty body as server_error" do + payload = %{status: 400, body: ""} + + jerr = ErrorNormalizer.normalize({:client_error, payload}, provider_id: "test") + + assert jerr.category == :server_error + assert jerr.retriable? == true + end + + test "reclassifies invalid JSON body as server_error" do + payload = %{status: 400, body: "not json at all"} + + jerr = ErrorNormalizer.normalize({:client_error, payload}, provider_id: "test") + + assert jerr.category == :server_error + assert jerr.retriable? == true + end + + test "reclassifies non-JSON-RPC JSON body as server_error" do + payload = %{status: 401, body: ~s({"error": "invalid api key"})} + + jerr = ErrorNormalizer.normalize({:client_error, payload}, provider_id: "test") + + assert jerr.category == :server_error + assert jerr.retriable? 
== true + end + + test "preserves original payload in data field" do + payload = %{status: 400, body: ~s({"message":"Invalid request"})} + + jerr = ErrorNormalizer.normalize({:client_error, payload}, provider_id: "test") + + assert jerr.data == payload + end + + test "preserves provider_id and transport" do + payload = %{status: 400, body: ~s({"message":"Invalid request"})} + + jerr = + ErrorNormalizer.normalize({:client_error, payload}, + provider_id: "base_drpc", + transport: :http + ) + + assert jerr.provider_id == "base_drpc" + assert jerr.transport == :http + end + end + + describe "server_error normalization (unchanged)" do + test "extracts JSON-RPC error from 5xx body" do + payload = %{ + status: 500, + body: ~s({"jsonrpc":"2.0","error":{"code":-32603,"message":"Internal error"},"id":1}) + } + + jerr = ErrorNormalizer.normalize({:server_error, payload}, provider_id: "test") + + assert jerr.code == -32_603 + assert jerr.category == :internal_error + end + + test "uses fallback when 5xx body is not JSON-RPC" do + payload = %{status: 502, body: "Bad Gateway"} + + jerr = ErrorNormalizer.normalize({:server_error, payload}, provider_id: "test") + + assert jerr.code == -32_002 + assert jerr.message == "Server error" + assert jerr.category == :server_error + assert jerr.retriable? 
== true + end + end +end diff --git a/test/lasso/rpc/provider_pool_probe_health_test.exs b/test/lasso/rpc/provider_pool_probe_health_test.exs index 65db8023..313f98e7 100644 --- a/test/lasso/rpc/provider_pool_probe_health_test.exs +++ b/test/lasso/rpc/provider_pool_probe_health_test.exs @@ -150,6 +150,106 @@ defmodule Lasso.RPC.ProviderPoolProbeHealthTest do assert provider.http_status == old_status end + test "probe success recovers :unhealthy to :degraded (graduated step)" do + p = provider_struct(%{id: "probe_grad_1", name: "P1"}) + chain_config = base_chain_config([p]) + chain = "probe_health_test_grad_1" + + {:ok, _pid} = ProviderPool.start_link({"default", chain, chain_config}) + :ok = ProviderPool.register_provider("default", chain, p.id, p) + + # First get to :healthy via probe + ProviderPool.update_probe_health("default", chain, p.id, :http, :success) + Process.sleep(50) + + # Drive to :unhealthy via live traffic failures (threshold is 3) + for _ <- 1..3 do + ProviderPool.report_failure("default", chain, p.id, {:network_error, "timeout"}, :http) + Process.sleep(20) + end + + {:ok, status} = ProviderPool.get_status("default", chain) + provider = Enum.find(status.providers, &(&1.id == p.id)) + assert provider.http_status == :unhealthy + + # Probe success should graduate :unhealthy → :degraded (not straight to :healthy) + ProviderPool.update_probe_health("default", chain, p.id, :http, :success) + Process.sleep(50) + + {:ok, status} = ProviderPool.get_status("default", chain) + provider = Enum.find(status.providers, &(&1.id == p.id)) + assert provider.http_status == :degraded + end + + test "second probe success completes recovery :degraded to :healthy" do + p = provider_struct(%{id: "probe_grad_2", name: "P1"}) + chain_config = base_chain_config([p]) + chain = "probe_health_test_grad_2" + + {:ok, _pid} = ProviderPool.start_link({"default", chain, chain_config}) + :ok = ProviderPool.register_provider("default", chain, p.id, p) + + # Get to :healthy via probe + 
ProviderPool.update_probe_health("default", chain, p.id, :http, :success) + Process.sleep(50) + + # Drive to :unhealthy via live traffic failures + for _ <- 1..3 do + ProviderPool.report_failure("default", chain, p.id, {:network_error, "timeout"}, :http) + Process.sleep(20) + end + + {:ok, status} = ProviderPool.get_status("default", chain) + provider = Enum.find(status.providers, &(&1.id == p.id)) + assert provider.http_status == :unhealthy + + # First probe: :unhealthy → :degraded + ProviderPool.update_probe_health("default", chain, p.id, :http, :success) + Process.sleep(50) + + {:ok, status} = ProviderPool.get_status("default", chain) + provider = Enum.find(status.providers, &(&1.id == p.id)) + assert provider.http_status == :degraded + + # Second probe: :degraded → :healthy + ProviderPool.update_probe_health("default", chain, p.id, :http, :success) + Process.sleep(50) + + {:ok, status} = ProviderPool.get_status("default", chain) + provider = Enum.find(status.providers, &(&1.id == p.id)) + assert provider.http_status == :healthy + assert provider.consecutive_failures == 0 + end + + test "probe success recovers :degraded to :healthy (single step)" do + p = provider_struct(%{id: "probe_deg_1", name: "P1"}) + chain_config = base_chain_config([p]) + chain = "probe_health_test_deg_1" + + {:ok, _pid} = ProviderPool.start_link({"default", chain, chain_config}) + :ok = ProviderPool.register_provider("default", chain, p.id, p) + + # Get to :healthy via probe + ProviderPool.update_probe_health("default", chain, p.id, :http, :success) + Process.sleep(50) + + # Drive to :degraded via 1 live traffic failure + ProviderPool.report_failure("default", chain, p.id, {:network_error, "timeout"}, :http) + Process.sleep(50) + + {:ok, status} = ProviderPool.get_status("default", chain) + provider = Enum.find(status.providers, &(&1.id == p.id)) + assert provider.http_status == :degraded + + # Probe success recovers :degraded → :healthy directly + 
ProviderPool.update_probe_health("default", chain, p.id, :http, :success) + Process.sleep(50) + + {:ok, status} = ProviderPool.get_status("default", chain) + provider = Enum.find(status.providers, &(&1.id == p.id)) + assert provider.http_status == :healthy + end + test "probe failure from :connecting transitions to :degraded (probes resolve stuck state only)" do p = provider_struct(%{id: "probe_multi_fail", name: "P1"}) chain_config = base_chain_config([p]) diff --git a/test/support/lasso/testing/mock_provider.ex b/test/support/lasso/testing/mock_provider.ex index c27c3137..ff608b61 100644 --- a/test/support/lasso/testing/mock_provider.ex +++ b/test/support/lasso/testing/mock_provider.ex @@ -24,7 +24,7 @@ defmodule Lasso.Testing.MockProvider do # Providers are now available for routing {:ok, result, _ctx} = RequestPipeline.execute_via_channels( - "ethereum", "eth_blockNumber", [], %Lasso.RPC.RequestOptions{strategy: :round_robin, timeout_ms: 30_000} + "ethereum", "eth_blockNumber", [], %Lasso.RPC.RequestOptions{strategy: :load_balanced, timeout_ms: 30_000} ) assert result == "0x1000" diff --git a/test/support/lasso/testing/telemetry_sync.ex b/test/support/lasso/testing/telemetry_sync.ex index 1b06eed6..a68379d4 100644 --- a/test/support/lasso/testing/telemetry_sync.ex +++ b/test/support/lasso/testing/telemetry_sync.ex @@ -17,13 +17,13 @@ defmodule Lasso.Testing.TelemetrySync do {:ok, collector} = TelemetrySync.attach_collector([:lasso, :request, :completed]) # Now execute the action that generates telemetry - {:ok, result, _ctx} = RequestPipeline.execute_via_channels(chain, "eth_blockNumber", [], %Lasso.RPC.RequestOptions{strategy: :round_robin, timeout_ms: 30_000}) + {:ok, result, _ctx} = RequestPipeline.execute_via_channels(chain, "eth_blockNumber", [], %Lasso.RPC.RequestOptions{strategy: :load_balanced, timeout_ms: 30_000}) # Wait for the event we collected {:ok, measurements, metadata} = TelemetrySync.await_event(collector, timeout: 1000) # INCORRECT: Execute 
first, attach second - {:ok, result, _ctx} = RequestPipeline.execute_via_channels(chain, "eth_blockNumber", [], %Lasso.RPC.RequestOptions{strategy: :round_robin, timeout_ms: 30_000}) + {:ok, result, _ctx} = RequestPipeline.execute_via_channels(chain, "eth_blockNumber", [], %Lasso.RPC.RequestOptions{strategy: :load_balanced, timeout_ms: 30_000}) {:ok, collector} = TelemetrySync.attach_collector([:lasso, :request, :completed]) TelemetrySync.await_event(collector, timeout: 1000) # Always times out! """ @@ -187,7 +187,7 @@ defmodule Lasso.Testing.TelemetrySync do chain, "eth_blockNumber", [], - %Lasso.RPC.RequestOptions{strategy: :round_robin, timeout_ms: 30_000} + %Lasso.RPC.RequestOptions{strategy: :load_balanced, timeout_ms: 30_000} ) end, match: [method: "eth_blockNumber"], @@ -229,7 +229,7 @@ defmodule Lasso.Testing.TelemetrySync do chain, "eth_blockNumber", [], - %Lasso.RPC.RequestOptions{strategy: :round_robin, timeout_ms: 30_000} + %Lasso.RPC.RequestOptions{strategy: :load_balanced, timeout_ms: 30_000} ) {:ok, measurements, metadata} = TelemetrySync.await_event(collector) diff --git a/test/support/lasso_integration_case.ex b/test/support/lasso_integration_case.ex index 7681361b..150c2d85 100644 --- a/test/support/lasso_integration_case.ex +++ b/test/support/lasso_integration_case.ex @@ -224,7 +224,7 @@ defmodule Lasso.Test.LassoIntegrationCase do request_opts = %RequestOptions{ transport: Keyword.get(opts, :transport), - strategy: Keyword.get(opts, :strategy, :round_robin), + strategy: Keyword.get(opts, :strategy, :load_balanced), provider_override: Keyword.get(opts, :provider_override), failover_on_override: Keyword.get(opts, :failover_on_override, false), timeout_ms: Keyword.get(opts, :timeout, 30_000)