From 465d076dce1443a76ba80e5b6cc2e10f7d464402 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20K=C5=82eczek?= Date: Thu, 20 Nov 2025 17:01:26 +0100 Subject: [PATCH] perf: Cache dbTables FuzzySet per schema Calculation of hint message when requested relation is not present in schema cache requires creation of a FuzzySet (to use fuzzy search to find candidate tables). For schemas with many tables it is costly. This patch introduces dbTablesFuzzyIndex in SchemaCache to memoize the FuzzySet creation. --- src/PostgREST/Error.hs | 11 +++++------ src/PostgREST/Plan.hs | 12 ++++++------ src/PostgREST/Response.hs | 4 ++-- src/PostgREST/SchemaCache.hs | 33 ++++++++++++++++++++++----------- test/io/big_schema.sql | 18 ++++++++++++++++++ test/io/test_big_schema.py | 20 ++++++++++++++++++++ 6 files changed, 73 insertions(+), 25 deletions(-) diff --git a/src/PostgREST/Error.hs b/src/PostgREST/Error.hs index 39217610cf..bee27bf367 100644 --- a/src/PostgREST/Error.hs +++ b/src/PostgREST/Error.hs @@ -41,6 +41,7 @@ import Network.HTTP.Types.Header (Header) import PostgREST.MediaType (MediaType (..)) import qualified PostgREST.MediaType as MediaType +import PostgREST.SchemaCache (TablesFuzzyIndex) import PostgREST.SchemaCache.Identifiers (QualifiedIdentifier (..), Schema) import PostgREST.SchemaCache.Relationship (Cardinality (..), @@ -49,10 +50,8 @@ import PostgREST.SchemaCache.Relationship (Cardinality (..), RelationshipsMap) import PostgREST.SchemaCache.Routine (Routine (..), RoutineParam (..)) -import PostgREST.SchemaCache.Table (Table (..)) import Protolude - class (ErrorBody a, JSON.ToJSON a) => PgrstError a where status :: a -> HTTP.Status headers :: a -> [Header] @@ -250,7 +249,7 @@ data SchemaCacheError | NoRelBetween Text Text (Maybe Text) Text RelationshipsMap | NoRpc Text Text [Text] MediaType Bool [QualifiedIdentifier] [Routine] | ColumnNotFound Text Text - | TableNotFound Text Text [Table] + | TableNotFound Text Text TablesFuzzyIndex deriving Show instance PgrstError SchemaCacheError where @@ -428,12 +427,12 @@ noRpcHint schema procName params allProcs overloadedProcs = -- | -- Do a fuzzy search in all tables in the same schema and return closest result -tableNotFoundHint :: Text -> Text -> [Table] -> Maybe Text -tableNotFoundHint schema tblName tblList +tableNotFoundHint :: Text -> Text -> TablesFuzzyIndex -> Maybe Text +tableNotFoundHint schema tblName dbTablesFuzzyIndex = fmap (\tbl -> "Perhaps you meant the table '" <> schema <> "." <> tbl <> "'") perhapsTable where perhapsTable = Fuzzy.getOne fuzzyTableSet tblName - fuzzyTableSet = Fuzzy.fromList [ tableName tbl | tbl <- tblList, tableSchema tbl == schema] + fuzzyTableSet = fromMaybe Fuzzy.defaultSet (HM.lookup schema dbTablesFuzzyIndex) compressedRel :: Relationship -> JSON.Value diff --git a/src/PostgREST/Plan.hs b/src/PostgREST/Plan.hs index df62291525..d5e698bd1c 100644 --- a/src/PostgREST/Plan.hs +++ b/src/PostgREST/Plan.hs @@ -172,7 +172,7 @@ dbActionPlan dbAct conf apiReq sCache = case dbAct of wrappedReadPlan :: QualifiedIdentifier -> AppConfig -> SchemaCache -> ApiRequest -> Bool -> Either Error CrudPlan wrappedReadPlan identifier conf sCache apiRequest@ApiRequest{iPreferences=Preferences{..},..} headersOnly = do - qi <- findTable identifier (dbTables sCache) + qi <- findTable identifier sCache rPlan <- readPlan qi conf sCache apiRequest (handler, mediaType) <- mapLeft ApiRequestError $ negotiateContent conf apiRequest qi iAcceptMediaType (dbMediaHandlers sCache) (hasDefaultSelect rPlan) if not (null invalidPrefs) && preferHandling == Just Strict then Left $ ApiRequestError $ InvalidPreferences invalidPrefs else Right () @@ -180,7 +180,7 @@ wrappedReadPlan identifier conf sCache apiRequest@ApiRequest{iPreferences=Prefe mutateReadPlan :: Mutation -> ApiRequest -> QualifiedIdentifier -> AppConfig -> SchemaCache -> Either Error CrudPlan mutateReadPlan mutation apiRequest@ApiRequest{iPreferences=Preferences{..},..} identifier conf sCache = do - qi <- findTable identifier (dbTables sCache) + qi <- findTable identifier sCache rPlan <- readPlan qi conf sCache apiRequest mPlan <- mutatePlan mutation qi apiRequest sCache rPlan if not (null invalidPrefs) && preferHandling == Just Strict then Left $ ApiRequestError $ InvalidPreferences invalidPrefs else Right () @@ -810,10 +810,10 @@ validateAggFunctions aggFunctionsAllowed (Node rp@ReadPlan {select} forest) | otherwise = Node rp <$> traverse (validateAggFunctions aggFunctionsAllowed) forest -- | Lookup table in the schema cache before creating read plan -findTable :: QualifiedIdentifier -> TablesMap -> Either Error QualifiedIdentifier -findTable qi@QualifiedIdentifier{..} tableMap = - case HM.lookup qi tableMap of - Nothing -> Left $ SchemaCacheErr $ TableNotFound qiSchema qiName (HM.elems tableMap) +findTable :: QualifiedIdentifier -> SchemaCache -> Either Error QualifiedIdentifier +findTable qi@QualifiedIdentifier{..} SchemaCache{dbTables, dbTablesFuzzyIndex} = + case HM.lookup qi dbTables of + Nothing -> Left $ SchemaCacheErr $ TableNotFound qiSchema qiName dbTablesFuzzyIndex Just _ -> Right qi addFilters :: ResolverContext -> ApiRequest -> ReadPlanTree -> Either Error ReadPlanTree diff --git a/src/PostgREST/Response.hs b/src/PostgREST/Response.hs index 065c722f71..22d9a5393b 100644 --- a/src/PostgREST/Response.hs +++ b/src/PostgREST/Response.hs @@ -209,10 +209,10 @@ actionResponse (MaybeDbResult InspectPlan{ipHdrsOnly=headersOnly} body) _ versio in Right $ PgrstResponse HTTP.status200 (MediaType.toContentType MTOpenAPI : cLHeader ++ maybeToList (profileHeader schema negotiatedByProfile)) rsBody -actionResponse (NoDbResult (RelInfoPlan qi@QualifiedIdentifier{..})) _ _ _ SchemaCache{dbTables} _ _ = +actionResponse (NoDbResult (RelInfoPlan qi@QualifiedIdentifier{..})) _ _ _ SchemaCache{dbTables, dbTablesFuzzyIndex} _ _ = case HM.lookup qi dbTables of Just tbl -> respondInfo $ allowH tbl - Nothing -> Left $ Error.SchemaCacheErr $ Error.TableNotFound qiSchema qiName (HM.elems dbTables) + Nothing -> Left $ Error.SchemaCacheErr $ Error.TableNotFound qiSchema qiName dbTablesFuzzyIndex where allowH table = let hasPK = not . null $ tablePKCols table in diff --git a/src/PostgREST/SchemaCache.hs b/src/PostgREST/SchemaCache.hs index 528d7905f4..475687484f 100644 --- a/src/PostgREST/SchemaCache.hs +++ b/src/PostgREST/SchemaCache.hs @@ -20,6 +20,7 @@ These queries are executed once at startup or when PostgREST is reloaded. module PostgREST.SchemaCache ( SchemaCache(..) + , TablesFuzzyIndex , querySchemaCache , showSummary , decodeFuncs @@ -66,21 +67,28 @@ import PostgREST.SchemaCache.Table (Column (..), ColumnMap, import qualified PostgREST.MediaType as MediaType -import Control.Arrow ((&&&)) -import Protolude -import System.IO.Unsafe (unsafePerformIO) +import Control.Arrow ((&&&)) +import qualified Data.FuzzySet as Fuzzy +import Protolude +import System.IO.Unsafe (unsafePerformIO) + +type TablesFuzzyIndex = HM.HashMap Schema Fuzzy.FuzzySet data SchemaCache = SchemaCache - { dbTables :: TablesMap - , dbRelationships :: RelationshipsMap - , dbRoutines :: RoutineMap - , dbRepresentations :: RepresentationsMap - , dbMediaHandlers :: MediaHandlerMap - , dbTimezones :: TimezoneNames + { dbTables :: TablesMap + , dbRelationships :: RelationshipsMap + , dbRoutines :: RoutineMap + , dbRepresentations :: RepresentationsMap + , dbMediaHandlers :: MediaHandlerMap + , dbTimezones :: TimezoneNames + -- Fuzzy index of table names per schema to support approximate matching + -- Since index construction can be expensive, we build it once and store in the SchemaCache + -- Haskell lazy evaluation ensures it's only built on first use and memoized afterwards + , dbTablesFuzzyIndex :: TablesFuzzyIndex } instance JSON.ToJSON SchemaCache where - toJSON (SchemaCache tabs rels routs reps hdlers tzs) = JSON.object [ + toJSON (SchemaCache tabs rels routs reps hdlers tzs _) = JSON.object [ "dbTables" .= JSON.toJSON tabs , "dbRelationships" .= JSON.toJSON rels , "dbRoutines" .= JSON.toJSON routs @@ -90,7 +98,7 @@ instance JSON.ToJSON SchemaCache where ] showSummary :: SchemaCache -> Text -showSummary (SchemaCache tbls rels routs reps mediaHdlrs tzs) = +showSummary (SchemaCache tbls rels routs reps mediaHdlrs tzs _) = T.intercalate ", " [ show (HM.size tbls) <> " Relations" , show (HM.size rels) <> " Relationships" @@ -166,6 +174,8 @@ querySchemaCache conf@AppConfig{..} = do , dbRepresentations = reps , dbMediaHandlers = HM.union mHdlers initialMediaHandlers -- the custom handlers will override the initial ones , dbTimezones = tzones + + , dbTablesFuzzyIndex = Fuzzy.fromList <$> HM.fromListWith (<>) ((qiSchema &&& pure . qiName) <$> HM.keys tabsWViewsPks) } where schemas = toList configDbSchemas @@ -203,6 +213,7 @@ removeInternal schemas dbStruct = , dbRepresentations = dbRepresentations dbStruct -- no need to filter, not directly exposed through the API , dbMediaHandlers = dbMediaHandlers dbStruct , dbTimezones = dbTimezones dbStruct + , dbTablesFuzzyIndex = dbTablesFuzzyIndex dbStruct } where hasInternalJunction ComputedRelationship{} = False diff --git a/test/io/big_schema.sql b/test/io/big_schema.sql index 34027c7d22..94326caca7 100644 --- a/test/io/big_schema.sql +++ b/test/io/big_schema.sql @@ -11375,6 +11375,24 @@ ALTER TABLE ONLY apflora.zielber ALTER TABLE apflora."user" ENABLE ROW LEVEL SECURITY; +-- Create many tables to test fuzzy string search +-- computing hints for non existing tables +DO +$$ +DECLARE + r record; +BEGIN + FOR r IN + SELECT + format('CREATE TABLE apflora.unknown_table_%s ()', n) AS ct + FROM + generate_series(1, 950) n + LOOP + EXECUTE r.ct; + END LOOP; +END +$$; + DROP ROLE IF EXISTS postgrest_test_anonymous; CREATE ROLE postgrest_test_anonymous; diff --git a/test/io/test_big_schema.py b/test/io/test_big_schema.py index ae8e8a9902..4af2b9f9d3 100644 --- a/test/io/test_big_schema.py +++ b/test/io/test_big_schema.py @@ -70,3 +70,23 @@ def test_should_not_fail_with_stack_overflow(defaultenv): assert response.status_code == 404 data = response.json() assert data["code"] == "PGRST205" + + +def test_second_request_for_non_existent_table_should_be_quick(defaultenv): + "requesting a non-existent relationship should be quick after the fuzzy search index is loaded (2nd request)" + + env = { + **defaultenv, + "PGRST_DB_SCHEMAS": "apflora", + "PGRST_DB_POOL": "2", + "PGRST_DB_ANON_ROLE": "postgrest_test_anonymous", + } + + with run(env=env, wait_max_seconds=30) as postgrest: + response = postgrest.session.get("/unknown-table") + assert response.status_code == 404 + data = response.json() + assert data["code"] == "PGRST205" + first_duration = response.elapsed.total_seconds() + response = postgrest.session.get("/unknown-table") + assert response.elapsed.total_seconds() < first_duration / 20