From 1d9eef54d8ffd7ad683d53767f30786d2bd3bba8 Mon Sep 17 00:00:00 2001 From: Chad Burt Date: Thu, 9 Nov 2023 15:50:07 -0800 Subject: [PATCH] Performance improvements to duplicate_ip checking When submitting a survey response, we store a hash of the request ip if available to detect repeated entries and potential abuse. IP addresses are hashed to protect privacy but checking hashes is slow, and becomes slower as the number of records grows. This is a fundamental limitation since we must store IP addresses with unique salts to protect privacy. To improve performance, we will now keep a maximum of 1000 hash records and also otherwise age them out after 1 week. --- packages/api/migrations/committed/000280.sql | 70 ++++++++++++++++++++ packages/api/migrations/current.sql | 66 ------------------ packages/api/schema.sql | 40 ++++++++++- 3 files changed, 107 insertions(+), 69 deletions(-) create mode 100644 packages/api/migrations/committed/000280.sql diff --git a/packages/api/migrations/committed/000280.sql b/packages/api/migrations/committed/000280.sql new file mode 100644 index 000000000..23d3d3238 --- /dev/null +++ b/packages/api/migrations/committed/000280.sql @@ -0,0 +1,70 @@ +--! Previous: sha1:6e14c22c9b93c63557203d4e25e5627d0ccf58f6 +--! Hash: sha1:6084ad8c19dffb844ed6a49cab24caac6dc51841 + +-- Enter migration here +alter table survey_response_network_addresses drop column if exists updated_at; +alter table survey_response_network_addresses add column if not exists updated_at integer not null default extract(epoch from date_trunc('hour', now())) + (extract(minute FROM now())::int / 5) * 60 * 5; + +CREATE OR REPLACE FUNCTION public.before_survey_response_insert() RETURNS trigger + LANGUAGE plpgsql SECURITY DEFINER + AS $$ + declare + existing survey_response_network_addresses; + begin + if current_setting('session.request_ip', true) is not null then + -- first, delete the oldest hashes of request IPs so that this + -- process doesn't become too inefficient + delete from + survey_response_network_addresses + where + updated_at <= ( + select + greatest( + -- Two weeks ago, + (extract(epoch from date_trunc('hour', now())) + + (extract(minute FROM now())::int / 5) * 60 * 5) - + 60 * 60 * 24 * 7, + -- Or the thousandth most recent entry updated_at + ( + select + updated_at + from + survey_response_network_addresses + order by + updated_at desc + limit 1 offset 1000 + ) + ) + ); + update + survey_response_network_addresses + set + num_responses = num_responses + 1, + updated_at = extract(epoch from date_trunc('hour', now())) + (extract(minute FROM now())::int / 5) * 60 * 5 + where + ip_hash = crypt( + current_setting('session.request_ip', true) || NEW.survey_id::text, + ip_hash + ) + returning + * + into + existing; + if existing is not null then + NEW.is_duplicate_ip = true; + else + insert into survey_response_network_addresses ( + survey_id, + ip_hash + ) values ( + NEW.survey_id, + crypt( + current_setting('session.request_ip', true) || NEW.survey_id::text, + gen_salt('md5') + ) + ); + end if; + end if; + return NEW; + end; +$$; diff --git a/packages/api/migrations/current.sql b/packages/api/migrations/current.sql index d414520ed..8da533983 100644 --- a/packages/api/migrations/current.sql +++ b/packages/api/migrations/current.sql @@ -1,67 +1 @@ -- Enter migration here -alter table survey_response_network_addresses drop column if exists updated_at; -alter table survey_response_network_addresses add column if not exists updated_at integer not null default extract(epoch from date_trunc('hour', now())) + (extract(minute FROM now())::int / 5) * 60 * 5; - -CREATE OR REPLACE FUNCTION public.before_survey_response_insert() RETURNS trigger - LANGUAGE plpgsql SECURITY DEFINER - AS $$ - declare - existing survey_response_network_addresses; - begin - if current_setting('session.request_ip', true) is not null then - -- first, delete the oldest hashes of request IPs so that this - -- process doesn't become too inefficient - delete from - survey_response_network_addresses - where - updated_at <= ( - select - greatest( - -- Two weeks ago, - (extract(epoch from date_trunc('hour', now())) + - (extract(minute FROM now())::int / 5) * 60 * 5) - - 60 * 60 * 24 * 7, - -- Or the thousandth most recent entry updated_at - ( - select - updated_at - from - survey_response_network_addresses - order by - updated_at desc - limit 1 offset 1000 - ) - ) - ); - update - survey_response_network_addresses - set - num_responses = num_responses + 1, - updated_at = extract(epoch from date_trunc('hour', now())) + (extract(minute FROM now())::int / 5) * 60 * 5 - where - ip_hash = crypt( - current_setting('session.request_ip', true) || NEW.survey_id::text, - ip_hash - ) - returning - * - into - existing; - if existing is not null then - NEW.is_duplicate_ip = true; - else - insert into survey_response_network_addresses ( - survey_id, - ip_hash - ) values ( - NEW.survey_id, - crypt( - current_setting('session.request_ip', true) || NEW.survey_id::text, - gen_salt('md5') - ) - ); - end if; - end if; - return NEW; - end; -$$; diff --git a/packages/api/schema.sql b/packages/api/schema.sql index 5b924bffd..72ec219ae 100644 --- a/packages/api/schema.sql +++ b/packages/api/schema.sql @@ -3512,9 +3512,35 @@ CREATE FUNCTION public.before_survey_response_insert() RETURNS trigger existing survey_response_network_addresses; begin if current_setting('session.request_ip', true) is not null then + -- first, delete the oldest hashes of request IPs so that this + -- process doesn't become too inefficient + delete from + survey_response_network_addresses + where + updated_at <= ( + select + greatest( + -- Two weeks ago, + (extract(epoch from date_trunc('hour', now())) + + (extract(minute FROM now())::int / 5) * 60 * 5) - + 60 * 60 * 24 * 7, + -- Or the thousandth most recent entry updated_at + ( + select + updated_at + from + survey_response_network_addresses + order by + updated_at desc + limit 1 offset 1000 + ) + ) + ); update survey_response_network_addresses - set num_responses = num_responses + 1 + set + num_responses = num_responses + 1, + updated_at = extract(epoch from date_trunc('hour', now())) + (extract(minute FROM now())::int / 5) * 60 * 5 where ip_hash = crypt( current_setting('session.request_ip', true) || NEW.survey_id::text, @@ -3534,7 +3560,7 @@ CREATE FUNCTION public.before_survey_response_insert() RETURNS trigger NEW.survey_id, crypt( current_setting('session.request_ip', true) || NEW.survey_id::text, - gen_salt('bf') + gen_salt('md5') ) ); end if; @@ -14518,7 +14544,8 @@ ALTER TABLE public.survey_invites ALTER COLUMN id ADD GENERATED BY DEFAULT AS ID CREATE TABLE public.survey_response_network_addresses ( survey_id integer NOT NULL, ip_hash text NOT NULL, - num_responses integer DEFAULT 1 NOT NULL + num_responses integer DEFAULT 1 NOT NULL, + updated_at integer DEFAULT (date_part('epoch'::text, date_trunc('hour'::text, now())) + (((((date_part('minute'::text, now()))::integer / 5) * 60) * 5))::double precision) NOT NULL ); @@ -16151,6 +16178,13 @@ CREATE TRIGGER before_survey_delete_trigger BEFORE DELETE ON public.surveys FOR CREATE TRIGGER before_survey_invited_groups_insert_trigger BEFORE INSERT ON public.survey_invited_groups FOR EACH ROW EXECUTE FUNCTION public.before_survey_invited_groups_insert(); +-- +-- Name: survey_responses before_survey_response_insert_trigger; Type: TRIGGER; Schema: public; Owner: - +-- + +CREATE TRIGGER before_survey_response_insert_trigger BEFORE INSERT ON public.survey_responses FOR EACH ROW EXECUTE FUNCTION public.before_survey_response_insert(); + + -- -- Name: surveys before_survey_update_trigger; Type: TRIGGER; Schema: public; Owner: - --