11 changes: 5 additions & 6 deletions src/PostgREST/Error.hs
@@ -41,6 +41,7 @@ import Network.HTTP.Types.Header (Header)
 import PostgREST.MediaType (MediaType (..))
 import qualified PostgREST.MediaType as MediaType

+import PostgREST.SchemaCache (TablesFuzzyIndex)
 import PostgREST.SchemaCache.Identifiers (QualifiedIdentifier (..),
                                           Schema)
 import PostgREST.SchemaCache.Relationship (Cardinality (..),
@@ -49,10 +50,8 @@ import PostgREST.SchemaCache.Relationship (Cardinality (..),
                                            RelationshipsMap)
 import PostgREST.SchemaCache.Routine (Routine (..),
                                       RoutineParam (..))
-import PostgREST.SchemaCache.Table (Table (..))
 import Protolude
-

 class (ErrorBody a, JSON.ToJSON a) => PgrstError a where
   status :: a -> HTTP.Status
   headers :: a -> [Header]
@@ -250,7 +249,7 @@ data SchemaCacheError
   | NoRelBetween Text Text (Maybe Text) Text RelationshipsMap
   | NoRpc Text Text [Text] MediaType Bool [QualifiedIdentifier] [Routine]
   | ColumnNotFound Text Text
-  | TableNotFound Text Text [Table]
+  | TableNotFound Text Text TablesFuzzyIndex
   deriving Show

 instance PgrstError SchemaCacheError where
@@ -428,12 +427,12 @@ noRpcHint schema procName params allProcs overloadedProcs =

 -- |
 -- Do a fuzzy search in all tables in the same schema and return closest result
-tableNotFoundHint :: Text -> Text -> [Table] -> Maybe Text
-tableNotFoundHint schema tblName tblList
+tableNotFoundHint :: Text -> Text -> TablesFuzzyIndex -> Maybe Text
+tableNotFoundHint schema tblName dbTablesFuzzyIndex
   = fmap (\tbl -> "Perhaps you meant the table '" <> schema <> "." <> tbl <> "'") perhapsTable
   where
     perhapsTable  = Fuzzy.getOne fuzzyTableSet tblName
-    fuzzyTableSet = Fuzzy.fromList [ tableName tbl | tbl <- tblList, tableSchema tbl == schema]
+    fuzzyTableSet = fromMaybe Fuzzy.defaultSet (HM.lookup schema dbTablesFuzzyIndex)


 compressedRel :: Relationship -> JSON.Value
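A note on the shape of this change: tableNotFoundHint previously rebuilt a FuzzySet from the full table list on every failed lookup, whereas it now fetches a prebuilt per-schema set. Below is a minimal self-contained sketch of the new lookup path, using the same Data.FuzzySet functions the diff relies on (fromList, getOne, defaultSet); the index contents and table names are made up for illustration:

```haskell
{-# LANGUAGE OverloadedStrings #-}

import qualified Data.FuzzySet       as Fuzzy
import qualified Data.HashMap.Strict as HM
import           Data.Maybe          (fromMaybe)
import           Data.Text           (Text)

-- The hint is now one HashMap lookup plus one getOne call; no FuzzySet
-- is (re)built per request. `index` plays the role of dbTablesFuzzyIndex.
hint :: HM.HashMap Text Fuzzy.FuzzySet -> Text -> Text -> Maybe Text
hint index schema tblName =
  Fuzzy.getOne (fromMaybe Fuzzy.defaultSet (HM.lookup schema index)) tblName

main :: IO ()
main = do
  let index = HM.fromList [("api", Fuzzy.fromList ["clients", "orders", "projects"])]
  print (hint index "api" "progects")  -- likely Just "projects"
```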
12 changes: 6 additions & 6 deletions src/PostgREST/Plan.hs
@@ -172,15 +172,15 @@ dbActionPlan dbAct conf apiReq sCache = case dbAct of

 wrappedReadPlan :: QualifiedIdentifier -> AppConfig -> SchemaCache -> ApiRequest -> Bool -> Either Error CrudPlan
 wrappedReadPlan identifier conf sCache apiRequest@ApiRequest{iPreferences=Preferences{..},..} headersOnly = do
-  qi <- findTable identifier (dbTables sCache)
+  qi <- findTable identifier sCache
   rPlan <- readPlan qi conf sCache apiRequest
   (handler, mediaType) <- mapLeft ApiRequestError $ negotiateContent conf apiRequest qi iAcceptMediaType (dbMediaHandlers sCache) (hasDefaultSelect rPlan)
   if not (null invalidPrefs) && preferHandling == Just Strict then Left $ ApiRequestError $ InvalidPreferences invalidPrefs else Right ()
   return $ WrappedReadPlan rPlan SQL.Read handler mediaType headersOnly qi

 mutateReadPlan :: Mutation -> ApiRequest -> QualifiedIdentifier -> AppConfig -> SchemaCache -> Either Error CrudPlan
 mutateReadPlan mutation apiRequest@ApiRequest{iPreferences=Preferences{..},..} identifier conf sCache = do
-  qi <- findTable identifier (dbTables sCache)
+  qi <- findTable identifier sCache
   rPlan <- readPlan qi conf sCache apiRequest
   mPlan <- mutatePlan mutation qi apiRequest sCache rPlan
   if not (null invalidPrefs) && preferHandling == Just Strict then Left $ ApiRequestError $ InvalidPreferences invalidPrefs else Right ()
@@ -810,10 +810,10 @@ validateAggFunctions aggFunctionsAllowed (Node rp@ReadPlan {select} forest)
   | otherwise = Node rp <$> traverse (validateAggFunctions aggFunctionsAllowed) forest

 -- | Lookup table in the schema cache before creating read plan
-findTable :: QualifiedIdentifier -> TablesMap -> Either Error QualifiedIdentifier
-findTable qi@QualifiedIdentifier{..} tableMap =
-  case HM.lookup qi tableMap of
-    Nothing -> Left $ SchemaCacheErr $ TableNotFound qiSchema qiName (HM.elems tableMap)
+findTable :: QualifiedIdentifier -> SchemaCache -> Either Error QualifiedIdentifier
+findTable qi@QualifiedIdentifier{..} SchemaCache{dbTables, dbTablesFuzzyIndex} =
+  case HM.lookup qi dbTables of
+    Nothing -> Left $ SchemaCacheErr $ TableNotFound qiSchema qiName dbTablesFuzzyIndex
     Just _ -> Right qi

 addFilters :: ResolverContext -> ApiRequest -> ReadPlanTree -> Either Error ReadPlanTree
4 changes: 2 additions & 2 deletions src/PostgREST/Response.hs
@@ -209,10 +209,10 @@ actionResponse (MaybeDbResult InspectPlan{ipHdrsOnly=headersOnly} body) _ versio
   in
   Right $ PgrstResponse HTTP.status200 (MediaType.toContentType MTOpenAPI : cLHeader ++ maybeToList (profileHeader schema negotiatedByProfile)) rsBody

-actionResponse (NoDbResult (RelInfoPlan qi@QualifiedIdentifier{..})) _ _ _ SchemaCache{dbTables} _ _ =
+actionResponse (NoDbResult (RelInfoPlan qi@QualifiedIdentifier{..})) _ _ _ SchemaCache{dbTables, dbTablesFuzzyIndex} _ _ =
   case HM.lookup qi dbTables of
     Just tbl -> respondInfo $ allowH tbl
-    Nothing  -> Left $ Error.SchemaCacheErr $ Error.TableNotFound qiSchema qiName (HM.elems dbTables)
+    Nothing  -> Left $ Error.SchemaCacheErr $ Error.TableNotFound qiSchema qiName dbTablesFuzzyIndex
   where
     allowH table =
       let hasPK = not . null $ tablePKCols table in
33 changes: 22 additions & 11 deletions src/PostgREST/SchemaCache.hs
@@ -20,6 +20,7 @@ These queries are executed once at startup or when PostgREST is reloaded.

 module PostgREST.SchemaCache
   ( SchemaCache(..)
+  , TablesFuzzyIndex
   , querySchemaCache
   , showSummary
   , decodeFuncs
@@ -66,21 +67,28 @@ import PostgREST.SchemaCache.Table (Column (..), ColumnMap,

 import qualified PostgREST.MediaType as MediaType

-import Control.Arrow ((&&&))
-import Protolude
-import System.IO.Unsafe (unsafePerformIO)
+import           Control.Arrow    ((&&&))
+import qualified Data.FuzzySet    as Fuzzy
+import           Protolude
+import           System.IO.Unsafe (unsafePerformIO)
+
+type TablesFuzzyIndex = HM.HashMap Schema Fuzzy.FuzzySet

 data SchemaCache = SchemaCache
-  { dbTables :: TablesMap
-  , dbRelationships :: RelationshipsMap
-  , dbRoutines :: RoutineMap
-  , dbRepresentations :: RepresentationsMap
-  , dbMediaHandlers :: MediaHandlerMap
-  , dbTimezones :: TimezoneNames
+  { dbTables           :: TablesMap
+  , dbRelationships    :: RelationshipsMap
+  , dbRoutines         :: RoutineMap
+  , dbRepresentations  :: RepresentationsMap
+  , dbMediaHandlers    :: MediaHandlerMap
+  , dbTimezones        :: TimezoneNames
+  -- Fuzzy index of table names per schema to support approximate matching
+  -- Since index construction can be expensive, we build it once and store in the SchemaCache
+  -- Haskell lazy evaluation ensures it's only built on first use and memoized afterwards
+  , dbTablesFuzzyIndex :: TablesFuzzyIndex
   }

 instance JSON.ToJSON SchemaCache where
-  toJSON (SchemaCache tabs rels routs reps hdlers tzs) = JSON.object [
+  toJSON (SchemaCache tabs rels routs reps hdlers tzs _) = JSON.object [
      "dbTables" .= JSON.toJSON tabs
    , "dbRelationships" .= JSON.toJSON rels
    , "dbRoutines" .= JSON.toJSON routs
@@ -90,7 +98,7 @@ instance JSON.ToJSON SchemaCache where
   ]

 showSummary :: SchemaCache -> Text
-showSummary (SchemaCache tbls rels routs reps mediaHdlrs tzs) =
+showSummary (SchemaCache tbls rels routs reps mediaHdlrs tzs _) =
   T.intercalate ", "
     [ show (HM.size tbls) <> " Relations"
     , show (HM.size rels) <> " Relationships"
@@ -166,6 +174,8 @@ querySchemaCache conf@AppConfig{..} = do
       , dbRepresentations = reps
      , dbMediaHandlers = HM.union mHdlers initialMediaHandlers -- the custom handlers will override the initial ones
       , dbTimezones = tzones
+
+      , dbTablesFuzzyIndex = Fuzzy.fromList <$> HM.fromListWith (<>) ((qiSchema &&& pure . qiName) <$> HM.keys tabsWViewsPks)
       }
   where
     schemas = toList configDbSchemas
@@ -203,6 +213,7 @@ removeInternal schemas dbStruct =
   , dbRepresentations = dbRepresentations dbStruct -- no need to filter, not directly exposed through the API
   , dbMediaHandlers = dbMediaHandlers dbStruct
   , dbTimezones = dbTimezones dbStruct
+  , dbTablesFuzzyIndex = dbTablesFuzzyIndex dbStruct
   }
   where
     hasInternalJunction ComputedRelationship{} = False
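The dbTablesFuzzyIndex expression above packs three steps into one line: pair each table key with a singleton list of its name, merge the lists per schema with HM.fromListWith (<>), and map Fuzzy.fromList over the result. A standalone sketch of the same shape follows (QI is a hypothetical stand-in for PostgREST's QualifiedIdentifier, just to keep the sketch self-contained). Since fmap over a HashMap is lazy in the values, each schema's FuzzySet is only forced the first time a hint needs it, which is what the record comment about lazy evaluation relies on:

```haskell
{-# LANGUAGE OverloadedStrings #-}

import           Control.Arrow       ((&&&))
import qualified Data.FuzzySet       as Fuzzy
import qualified Data.HashMap.Strict as HM
import           Data.Text           (Text)

-- Hypothetical stand-in for QualifiedIdentifier.
data QI = QI { qiSchema :: Text, qiName :: Text }

-- Same shape as the dbTablesFuzzyIndex expression in the diff: group table
-- names per schema, then build one FuzzySet per schema.
buildIndex :: [QI] -> HM.HashMap Text Fuzzy.FuzzySet
buildIndex keys =
  Fuzzy.fromList <$> HM.fromListWith (<>) ((qiSchema &&& pure . qiName) <$> keys)

main :: IO ()
main = do
  let idx = buildIndex [QI "api" "clients", QI "api" "orders", QI "internal" "jobs"]
  print (HM.keys idx)  -- ["api","internal"] (order unspecified)
```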
18 changes: 18 additions & 0 deletions test/io/big_schema.sql
@@ -11375,6 +11375,24 @@ ALTER TABLE ONLY apflora.zielber

 ALTER TABLE apflora."user" ENABLE ROW LEVEL SECURITY;

+-- Create many tables to test fuzzy string search
+-- computing hints for non existing tables
+DO
+$$
+DECLARE
+  r record;
+BEGIN
+  FOR r IN
+    SELECT
+      format('CREATE TABLE apflora.unknown_table_%s ()', n) AS ct
+    FROM
+      generate_series(1, 950) n
+  LOOP
+    EXECUTE r.ct;
+  END LOOP;
+END
+$$;
+
 DROP ROLE IF EXISTS postgrest_test_anonymous;
 CREATE ROLE postgrest_test_anonymous;
20 changes: 20 additions & 0 deletions test/io/test_big_schema.py
@@ -70,3 +70,23 @@ def test_should_not_fail_with_stack_overflow(defaultenv):
     assert response.status_code == 404
     data = response.json()
     assert data["code"] == "PGRST205"
+
+
+def test_second_request_for_non_existent_table_should_be_quick(defaultenv):
+    "requesting a non-existent relationship should be quick after the fuzzy search index is loaded (2nd request)"
+
+    env = {
+        **defaultenv,
+        "PGRST_DB_SCHEMAS": "apflora",
+        "PGRST_DB_POOL": "2",
+        "PGRST_DB_ANON_ROLE": "postgrest_test_anonymous",
+    }
+
+    with run(env=env, wait_max_seconds=30) as postgrest:
+        response = postgrest.session.get("/unknown-table")
+        assert response.status_code == 404
+        data = response.json()
+        assert data["code"] == "PGRST205"
+        first_duration = response.elapsed.total_seconds()
+        response = postgrest.session.get("/unknown-table")
+        assert response.elapsed.total_seconds() < first_duration / 20
Comment on lines +85 to +92
taimoorzaeem (Collaborator):
I noticed something strange on my system. When I run the test with the GET endpoint changed to /table-with-a-weird-name, the test fails with an exception:

FAILED test/io/test_big_schema.py::test_second_request_for_non_existent_table_should_be_quick - requests.exceptions.ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

mkleczek (Contributor, Author) commented on Nov 23, 2025:
@taimoorzaeem I'm afraid the same thing happens when running this test against main.

This is expected, as this PR only makes sure the FuzzySet is created once instead of every time a hint is calculated. The hint calculation logic and the data structures stay the same.

IMHO it looks like the library we use for fuzzy search has reliability issues, and we should look for other solutions.

mkleczek (Contributor, Author) commented on Nov 23, 2025:
@taimoorzaeem I found https://hackage-content.haskell.org/package/fuzzily-0.2.1.0/docs/Text-Fuzzily.html and https://hackage.haskell.org/package/fuzzyfind-3.0.2/docs/Text-FuzzyFind.html, but they both implement online fuzzy search. For us this means that hint calculation time is at least linear in the number of tables, which I don't think is a good idea, as it will almost certainly fail with a timeout for large schemas.
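To make the cost concrete, here is a rough sketch of what an online matcher has to do (not PostgREST code; score stands in for whatever similarity metric such a library computes, and the toy length-based score exists only to make the sketch runnable):

```haskell
import           Data.List (maximumBy)
import           Data.Ord  (comparing)
import           Data.Text (Text)
import qualified Data.Text as T

-- With no precomputed index, each hint is O(number of tables):
-- every candidate gets scored against the query on every request.
onlineHint :: (Text -> Text -> Double) -> [Text] -> Text -> Maybe Text
onlineHint _     []    _ = Nothing
onlineHint score names q = Just (maximumBy (comparing (score q)) names)

main :: IO ()
main = print (onlineHint score ["clients", "orders", "projects"] "progects")
  where
    -- toy score: negative length difference, for shape only
    score q t = negate (abs (fromIntegral (T.length q - T.length t)))
```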

taimoorzaeem (Collaborator):
I think the most scalable solution would be to implement the SymSpell algorithm. See: ref1, ref2, ref3.

I think a Haskell package for this would be great for the entire Haskell community.

mkleczek (Contributor, Author) commented on Nov 24, 2025:
> I think the most scalable solution would be to implement the SymSpell algorithm. See: ref1, ref2, ref3.

The problem with SymSpell is that it is memory hungry (e.g., for 100,000 words and an edit distance of 2 it requires 1,500,000 entries in the dictionary). Building the index is also very costly.
You can minimize memory requirements with perfect hashing, but that makes building the index even more costly.
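As a rough check of that figure: within delete-distance 2, a word of length n yields at most C(n,1) + C(n,2) delete strings, so the quoted count corresponds to names of roughly five characters (an assumed average; repeated characters would make the true count somewhat lower):

```haskell
-- Upper bound on delete entries per word for edit distance 2.
deletesUpTo2 :: Int -> Int
deletesUpTo2 n = n + n * (n - 1) `div` 2  -- C(n,1) + C(n,2)

main :: IO ()
main = print (100000 * deletesUpTo2 5)  -- 1500000, matching the estimate above
```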

In general, this is a tough problem. It is even tougher if the spelling dictionary is dynamic (relation names in the schema cache can be reloaded, which requires rebuilding the index).

> I think a Haskell package for this would be great for the entire Haskell community.

Undoubtedly. You can read about state-of-the-art techniques, for example, here: https://towardsdatascience.com/spelling-correction-how-to-make-an-accurate-and-fast-corrector-dc6d0bcbba5f/
Implementing this in Haskell would be a very interesting task.

Having said that, when you think about the whole architecture from a high level, PostgREST does not seem to be the right place to implement an in-memory text search engine, especially since it is a component that is supposed to scale easily (i.e., start new instances quickly). In such scenarios you want to externalize state, i.e., have a separate component implementing the text search algorithms. But you already have such a component: PostgreSQL itself!

IMHO these are the possible paths for PostgREST:

  1. Stay with the current architecture (i.e., query building based on an in-memory schema cache outside of the database) and implement limited but cheap spell checking.
  2. Move query building to the database itself and use database facilities to handle spell checking (not really viable, as it would be a complete rewrite).
  3. Stay with the current architecture and externalize spell checking:
    • by invoking an external system when calculating the hint (there are multiple questions here: should it be some specialized search engine? We don't want to have such a dependency. Should it be PostgreSQL? Then see below.)
    • by letting PostgreSQL handle misspelled relations (which in essence means reverting "fix: handle queries on non-existing table gracefully", #3869)

It just seems to me that we've hit a wall with our current architecture here, and the only thing we can do is either admit it and live with deficient spell checking, or re-architect and rewrite PostgREST.

One mitigation would be to work on the fuzzyset library and try to optimize it as much as possible (I've taken a quick look at the source and found some minor optimization opportunities). The real question is: what schema sizes are "normal" for PostgREST, and what sizes are outside of what PostgREST supports?

@taimoorzaeem @steve-chavez does the above make sense?