Commit 5c4a1ba

fix node tuned shm param

1 parent 0111c73 commit 5c4a1ba

4 files changed: +98, -75 lines changed

roles/node/templates/tuned-crit.conf

Lines changed: 32 additions & 19 deletions
@@ -20,25 +20,23 @@ governor=performance
 energy_perf_bias=performance
 min_perf_pct=100
 
-[vm]
-# disable transparent hugepages
-transparent_hugepages=never
-
 [sysctl]
 #-------------------------------------------------------------#
 # KERNEL #
 #-------------------------------------------------------------#
 # disable numa balancing
 kernel.numa_balancing=0
 
-# total shmem size in bytes: $(expr $(getconf _PHYS_PAGES) / * 0.75 \* $(getconf PAGE_SIZE))
-# total mem: {{ node_mem_bytes }}
-kernel.shmall = {{ (node_mem_bytes|int * 0.75)|int }}
+# do not schedule postgres process in group
+kernel.sched_autogroup_enabled = 0
+
+# shmall in pages
+kernel.shmall = {{ (node_pages|int * 0.75)|int }}
 
-# total shmax size in pages: $(expr $(getconf _PHYS_PAGES) * 0.75 )
-kernel.shmax = {{ (node_pages|int * 0.75)|int }}
+# shmmax size in bytes
+kernel.shmmax = {{ (node_mem_bytes|int * 0.75)|int }}
 
-# total shmem segs 4096 -> 8192
+# total shmem segs
 kernel.shmmni=8192
 
 # total msg queue number, set to mem size in MB
@@ -55,27 +53,42 @@ kernel.pid_max=131072
 # max(Sem in Set)=2048, max(Sem)=max(Sem in Set) x max(SemSet), max(Sem per Ops)=2048, max(SemSet)=65536
 kernel.sem=2048 134217728 2048 65536
 
-# do not sched postgres process in group
-kernel.sched_autogroup_enabled = 0
-
 # total time the scheduler will consider a migrated process cache hot and, thus, less likely to be remigrated
 # default = 0.5ms (500000ns), update to 5ms, depending on your typical query (e.g. < 1ms)
 kernel.sched_migration_cost_ns=5000000
 
-
 #-------------------------------------------------------------#
 # VM #
 #-------------------------------------------------------------#
+# disable transparent hugepages
+vm.transparent_hugepages=never
+
 # try not using swap
 vm.swappiness=1
 
 # disable when most mem are for file cache
 vm.zone_reclaim_mode=0
 
-# 64MB mem (2xRAID cache) wake the bgwriter
-vm.dirty_background_bytes=67108864
-# vm.dirty_background_ratio=3 # latency-performance default
-vm.dirty_ratio=10 # latency-performance default
+{% if node_overcommit_ratio|int > 0 %}
+# overcommit threshold = {{ node_overcommit_ratio }}%
+vm.overcommit_memory=2
+vm.overcommit_ratio={{ node_overcommit_ratio }}
+{% else %}
+vm.overcommit_memory=0
+vm.overcommit_ratio=100
+{% endif %}
+
+# Starts background writeback of dirty data at this percentage of total memory, via a pdflush daemon. The default value is 10; reduce to 1
+vm.dirty_background_ratio=1
+
+# Starts active writeback of dirty data at this percentage of total memory for the generator of dirty data, via pdflush. The default value is 40; reduce to 5
+vm.dirty_ratio=5
+
+# Defines when dirty in-memory data is old enough to be eligible for writeout. default 30s, reduce to 1s
+vm.dirty_expire_centisecs = 100
+
+# Defines the interval between pdflush daemon wakeups; the daemon periodically writes dirty in-memory data out to disk. The default value is 500, reduce to 1s
+vm.dirty_writeback_centisecs = 100
 
 # deny access on 0x00000 - 0x10000
 vm.mmap_min_addr=65536
@@ -93,7 +106,7 @@ fs.aio-max-nr=1048576
 #-------------------------------------------------------------#
 # Network #
 #-------------------------------------------------------------#
-# max connection in listen queue (triggers retrans if full)
+# max connection in listen queue (triggers re-trans if full)
 net.core.somaxconn=65535
 net.core.netdev_max_backlog=8192
 # tcp receive/transmit buffer default = 256KiB
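
The shm fix above is easiest to sanity-check with getconf, which is what the old (broken) comments were attempting: shmall is counted in pages, shmmax in bytes, both capped at 75% of physical memory. A minimal sketch for any Linux host with getconf available (the template renders the same numbers from the node_pages and node_mem_bytes facts):

  # kernel.shmall is in PAGE_SIZE pages: 75% of physical pages
  echo "kernel.shmall = $(( $(getconf _PHYS_PAGES) * 3 / 4 ))"
  # kernel.shmmax is in bytes: 75% of physical memory
  echo "kernel.shmmax = $(( $(getconf _PHYS_PAGES) * $(getconf PAGE_SIZE) * 3 / 4 ))"

The * 3 / 4 integer form stands in for the template's |int * 0.75 rounding; on a 16 GiB host with 4 KiB pages, for example, this yields shmall=3145728 and shmmax=12884901888.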

roles/node/templates/tuned-olap.conf

Lines changed: 23 additions & 17 deletions
@@ -20,25 +20,23 @@ governor=performance
 energy_perf_bias=performance
 min_perf_pct=100
 
-[vm]
-# disable transparent hugepages
-transparent_hugepages=never
-
 [sysctl]
 #-------------------------------------------------------------#
 # KERNEL #
 #-------------------------------------------------------------#
 # disable numa balancing
 kernel.numa_balancing=0
 
-# total shmem size in bytes: $(expr $(getconf _PHYS_PAGES) / * 0.75 \* $(getconf PAGE_SIZE))
-# total mem: {{ node_mem_bytes }}
-kernel.shmall = {{ (node_mem_bytes|int * 0.75)|int }}
+# do not schedule postgres process in group
+kernel.sched_autogroup_enabled = 0
+
+# shmall in pages
+kernel.shmall = {{ (node_pages|int * 0.75)|int }}
 
-# total shmax size in pages: $(expr $(getconf _PHYS_PAGES) * 0.75 )
-kernel.shmax = {{ (node_pages|int * 0.75)|int }}
+# shmmax size in bytes
+kernel.shmmax = {{ (node_mem_bytes|int * 0.75)|int }}
 
-# total shmem segs 4096 -> 8192
+# total shmem segs
 kernel.shmmni=8192
 
 # total msg queue number, set to mem size in MB
@@ -55,16 +53,16 @@ kernel.pid_max=131072
 # max(Sem in Set)=2048, max(Sem)=max(Sem in Set) x max(SemSet), max(Sem per Ops)=2048, max(SemSet)=65536
 kernel.sem=2048 134217728 2048 65536
 
-# do not sched postgres process in group
-kernel.sched_autogroup_enabled = 0
-
 # total time the scheduler will consider a migrated process cache hot and, thus, less likely to be remigrated
 # default = 0.5ms (500000ns), update to 5ms, depending on your typical query (e.g. < 1ms)
 kernel.sched_migration_cost_ns=5000000
 
 #-------------------------------------------------------------#
 # VM #
 #-------------------------------------------------------------#
+# disable transparent hugepages
+vm.transparent_hugepages=never
+
 # try not using swap
 vm.swappiness=1
 
@@ -80,9 +78,17 @@ vm.overcommit_ratio={{ node_overcommit_ratio }}
 #vm.overcommit_ratio=100
 {% endif %}
 
-# allow 90% dirty ratio on OLAP instance
-vm.dirty_background_ratio = 20 # throughput-performance default
-vm.dirty_ratio=90 # throughput-performance default 40 -> 90
+# Starts background writeback of dirty data at this percentage of total memory, via a pdflush daemon. The default value is 10.
+vm.dirty_background_ratio=10
+
+# Starts active writeback of dirty data at this percentage of total memory for the generator of dirty data, via pdflush. The default value is 40; increase to 80
+vm.dirty_ratio=80
+
+# Defines when dirty in-memory data is old enough to be eligible for writeout. default 30s
+#vm.dirty_expire_centisecs = 3000
+
+# Defines the interval between pdflush daemon wakeups; the daemon periodically writes dirty in-memory data out to disk. The default value is 500
+#vm.dirty_writeback_centisecs = 100
 
 # deny access on 0x00000 - 0x10000
 vm.mmap_min_addr=65536
@@ -101,7 +107,7 @@ fs.aio-max-nr=1048576
 #-------------------------------------------------------------#
 # Network #
 #-------------------------------------------------------------#
-# max connection in listen queue (triggers retrans if full)
+# max connection in listen queue (triggers re-trans if full)
 net.core.somaxconn=65535
 net.core.netdev_max_backlog=8192
 # tcp receive/transmit buffer default = 256KiB
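
The overcommit block that appears as context above works the same way in all three server templates: with a positive node_overcommit_ratio, strict accounting (vm.overcommit_memory=2) caps the kernel's commit limit at roughly swap plus that percentage of RAM. One way to observe the effect on a running host:

  # with overcommit_memory=2: CommitLimit ~= SwapTotal + MemTotal * overcommit_ratio / 100
  cat /proc/sys/vm/overcommit_memory /proc/sys/vm/overcommit_ratio
  grep -E '^(CommitLimit|Committed_AS):' /proc/meminfo

When Committed_AS approaches CommitLimit, allocations start failing instead of invoking the OOM killer, which is usually the preferable failure mode for a database node.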

roles/node/templates/tuned-oltp.conf

Lines changed: 23 additions & 17 deletions
@@ -20,25 +20,23 @@ governor=performance
 energy_perf_bias=performance
 min_perf_pct=100
 
-[vm]
-# disable transparent hugepages
-transparent_hugepages=never
-
 [sysctl]
 #-------------------------------------------------------------#
 # KERNEL #
 #-------------------------------------------------------------#
 # disable numa balancing
 kernel.numa_balancing=0
 
-# total shmem size in bytes: $(expr $(getconf _PHYS_PAGES) / * 0.75 \* $(getconf PAGE_SIZE))
-# total mem: {{ node_mem_bytes }}
-kernel.shmall = {{ (node_mem_bytes|int * 0.75)|int }}
+# do not schedule postgres process in group
+kernel.sched_autogroup_enabled = 0
+
+# shmall in pages
+kernel.shmall = {{ (node_pages|int * 0.75)|int }}
 
-# total shmax size in pages: $(expr $(getconf _PHYS_PAGES) * 0.75 )
-kernel.shmax = {{ (node_pages|int * 0.75)|int }}
+# shmmax size in bytes
+kernel.shmmax = {{ (node_mem_bytes|int * 0.75)|int }}
 
-# total shmem segs 4096 -> 8192
+# total shmem segs
 kernel.shmmni=8192
 
 # total msg queue number, set to mem size in MB
@@ -55,16 +53,16 @@ kernel.pid_max=131072
 # max(Sem in Set)=2048, max(Sem)=max(Sem in Set) x max(SemSet), max(Sem per Ops)=2048, max(SemSet)=65536
 kernel.sem=2048 134217728 2048 65536
 
-# do not sched postgres process in group
-kernel.sched_autogroup_enabled = 0
-
 # total time the scheduler will consider a migrated process cache hot and, thus, less likely to be remigrated
 # default = 0.5ms (500000ns), update to 5ms, depending on your typical query (e.g. < 1ms)
 kernel.sched_migration_cost_ns=5000000
 
 #-------------------------------------------------------------#
 # VM #
 #-------------------------------------------------------------#
+# disable transparent hugepages
+vm.transparent_hugepages=never
+
 # try not using swap
 vm.swappiness=1
 
@@ -80,9 +78,17 @@ vm.overcommit_memory=0
 vm.overcommit_ratio=100
 {% endif %}
 
-# vm.dirty_background_bytes=67108864 # 64MB mem (2xRAID cache) wake the bgwriter
-vm.dirty_background_ratio=3 # latency-performance default
-vm.dirty_ratio=30 # latency-performance default
+# Starts background writeback of dirty data at this percentage of total memory, via a pdflush daemon. The default value is 10.
+vm.dirty_background_ratio=3
+
+# Starts active writeback of dirty data at this percentage of total memory for the generator of dirty data, via pdflush. The default value is 40.
+vm.dirty_ratio=15
+
+# Defines when dirty in-memory data is old enough to be eligible for writeout. default 30s, reduce to 5s
+vm.dirty_expire_centisecs = 500
+
+# Defines the interval between pdflush daemon wakeups; the daemon periodically writes dirty in-memory data out to disk. The default value is 500, reduce to 1s
+vm.dirty_writeback_centisecs = 100
 
 # deny access on 0x00000 - 0x10000
 vm.mmap_min_addr=65536
@@ -101,7 +107,7 @@ fs.aio-max-nr=1048576
 #-------------------------------------------------------------#
 # Network #
 #-------------------------------------------------------------#
-# max connection in listen queue (triggers retrans if full)
+# max connection in listen queue (triggers re-trans if full)
 net.core.somaxconn=65535
 net.core.netdev_max_backlog=8192
 # tcp receive/transmit buffer default = 256KiB
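
The last two dirty-page knobs in each template are easy to conflate: dirty_expire_centisecs is the age at which dirty pages become eligible for writeout, while dirty_writeback_centisecs is how often the flusher threads wake up to write them. To inspect the running values and watch the dirty-page counters while tuning:

  sysctl vm.dirty_expire_centisecs vm.dirty_writeback_centisecs
  grep -E '^(Dirty|Writeback):' /proc/meminfo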

roles/node/templates/tuned-tiny.conf

Lines changed: 20 additions & 22 deletions
@@ -11,45 +11,39 @@
 #==============================================================#
 
 [main]
-summary=Optimize for PostgreSQL TINY System
+summary=Optimize for TINY System
 # include=virtual-guest
 
-[vm]
-# disable transparent hugepages
-transparent_hugepages=never
-
 [sysctl]
 #-------------------------------------------------------------#
 # KERNEL #
 #-------------------------------------------------------------#
 # disable numa balancing
 kernel.numa_balancing=0
 
-# total shmem size in bytes: $(expr $(getconf _PHYS_PAGES) / * 0.75 \* $(getconf PAGE_SIZE))
-# total mem: {{ node_mem_bytes }}
-kernel.shmall = {{ (node_mem_bytes|int * 0.75)|int }}
+# do not schedule postgres process in group
+kernel.sched_autogroup_enabled = 0
 
-# total shmax size in pages: $(expr $(getconf _PHYS_PAGES) * 0.75 )
-kernel.shmax = {{ (node_pages|int * 0.75)|int }}
+# shmall in pages
+kernel.shmall = {{ (node_pages|int * 0.75)|int }}
 
-# If a workload mostly uses anonymous memory and it hits this limit, the entire
-# working set is buffered for I/O, and any more write buffering would require
-# swapping, so it's time to throttle writes until I/O can catch up. Workloads
-# that mostly use file mappings may be able to use even higher values.
-#
-# The generator of dirty data starts writeback at this percentage (system default
-# is 20%)
-vm.dirty_ratio = 50
+# shmmax size in bytes
+kernel.shmmax = {{ (node_mem_bytes|int * 0.75)|int }}
+
+# total shmem segs
+kernel.shmmni=8192
 
-# Filesystem I/O is usually much more efficient than swapping, so try to keep
-# swapping low. It's usually safe to go even lower than this on systems with
-# server-grade storage.
-vm.swappiness = 10
 
 
 #-------------------------------------------------------------#
 # VM #
 #-------------------------------------------------------------#
+# disable transparent hugepages
+vm.transparent_hugepages=never
+
+# try not using swap
+vm.swappiness=10
+
 {% if node_overcommit_ratio|int > 0 %}
 # overcommit threshold = {{ node_overcommit_ratio }}%
 vm.overcommit_memory=2
@@ -59,6 +53,10 @@ vm.overcommit_ratio={{ node_overcommit_ratio }}
 #vm.overcommit_ratio=100
 {% endif %}
 
+
+# deny access on 0x00000 - 0x10000
+vm.mmap_min_addr=65536
+
 #-------------------------------------------------------------#
 # Network #
 #-------------------------------------------------------------#
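
None of these templates take effect until tuned loads them as the active profile. A minimal sketch for applying and verifying one, assuming the role installs the rendered file under a tuned profile directory (the profile name tiny below is an assumption, not something this diff establishes):

  tuned-adm profile tiny    # activate the profile (name is hypothetical)
  tuned-adm active          # confirm which profile is in effect
  tuned-adm verify          # compare running values against the profile

tuned applies the [sysctl] section at activation time, so an unknown key typically surfaces in the tuned log or a failed verify rather than breaking the template render.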
