From 76fca021ae69c55c17833234fb3d56dcc07c2900 Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Mon, 25 Aug 2025 12:34:37 +0200 Subject: [PATCH 1/2] Celery: add extra broker settings - `socket_timeout`: match visibility timeout to avoid the socket being closed during this period of time. - `retry_on_timeout`: allow the worker to retry the communication if the timeout was reached. - `health_check_interval`: check the connection and re-connect if it's lost. This should help with the issue of all workers being disconnected when the Redis instance runs out of memory (OOM, pidbox keys). --- readthedocs/settings/base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/readthedocs/settings/base.py b/readthedocs/settings/base.py index df312568226..37361e1ba25 100644 --- a/readthedocs/settings/base.py +++ b/readthedocs/settings/base.py @@ -641,6 +641,9 @@ def BUILD_MEMORY_LIMIT(self): # https://docs.celeryq.dev/en/stable/getting-started/backends-and-brokers/redis.html#visibility-timeout BROKER_TRANSPORT_OPTIONS = { 'visibility_timeout': BUILD_TIME_LIMIT * 1.15, # 15% more than the build time limit + 'socket_timeout': BUILD_TIME_LIMIT * 1.15, + 'retry_on_timeout': True, + 'health_check_interval': 30, } CELERY_DEFAULT_QUEUE = "celery" From 8da330721ebc24a71ef94e17ca5b9ab767e004f0 Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Tue, 26 Aug 2025 20:49:28 +0200 Subject: [PATCH 2/2] Increase visibility and socket timeouts to 5 hours --- readthedocs/settings/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/readthedocs/settings/base.py b/readthedocs/settings/base.py index 37361e1ba25..febb0809023 100644 --- a/readthedocs/settings/base.py +++ b/readthedocs/settings/base.py @@ -640,8 +640,8 @@ def BUILD_MEMORY_LIMIT(self): # https://github.com/readthedocs/readthedocs.org/issues/12317#issuecomment-3070950434 # https://docs.celeryq.dev/en/stable/getting-started/backends-and-brokers/redis.html#visibility-timeout BROKER_TRANSPORT_OPTIONS = { - 'visibility_timeout': 
BUILD_TIME_LIMIT * 1.15, # 15% more than the build time limit - 'socket_timeout': BUILD_TIME_LIMIT * 1.15, + 'visibility_timeout': 5 * 60 * 60, # Use 5 hours to cover the longest build just in case + 'socket_timeout': 5 * 60 * 60, 'retry_on_timeout': True, 'health_check_interval': 30, }