From 538d466910f55badbad24c6ce0c7189133eb7695 Mon Sep 17 00:00:00 2001 From: Jorgen Lundman Date: Tue, 31 Oct 2023 10:51:17 +0900 Subject: [PATCH] Update kmem to latest macOS Fix the memory pressure thread to set the pressure variable, as well as relax them on timeout (no pressure) Update abdos.c to actually allocate abd from the expected cache, so reaping works. Signed-off-by: Jorgen Lundman --- include/os/windows/spl/sys/debug.h | 5 + include/os/windows/spl/sys/kmem.h | 4 +- include/os/windows/spl/sys/kmem_impl.h | 3 +- include/os/windows/spl/sys/seg_kmem.h | 3 +- include/os/windows/spl/sys/vmem.h | 6 +- include/os/windows/spl/sys/vmem_impl.h | 2 + module/os/windows/spl/spl-kmem.c | 642 +++++++++++++++++----- module/os/windows/spl/spl-kstat.c | 2 +- module/os/windows/spl/spl-seg_kmem.c | 49 +- module/os/windows/spl/spl-vmem.c | 730 ++++++++++--------------- module/os/windows/zfs/abd_os.c | 386 +++++++++++-- module/os/windows/zfs/arc_os.c | 8 + 12 files changed, 1216 insertions(+), 624 deletions(-) diff --git a/include/os/windows/spl/sys/debug.h b/include/os/windows/spl/sys/debug.h index 86a72df1e2bf..541bf0555ddd 100644 --- a/include/os/windows/spl/sys/debug.h +++ b/include/os/windows/spl/sys/debug.h @@ -91,6 +91,11 @@ #endif +// cdefs.h +#ifndef __DECONST +#define __DECONST(type, var) ((type)(uintptr_t)(const void *)(var)) +#endif + extern void _Noreturn panic(const char *fmt, ...); diff --git a/include/os/windows/spl/sys/kmem.h b/include/os/windows/spl/sys/kmem.h index 412db7600ead..07ca684ca65e 100644 --- a/include/os/windows/spl/sys/kmem.h +++ b/include/os/windows/spl/sys/kmem.h @@ -72,7 +72,7 @@ extern uint64_t physmem; void *zfs_kmem_alloc(size_t size, int kmflags); void *zfs_kmem_zalloc(size_t size, int kmflags); -void zfs_kmem_free(void *buf, size_t size); +void zfs_kmem_free(const void *buf, size_t size); void spl_kmem_init(uint64_t); void spl_kmem_thread_init(); @@ -128,7 +128,7 @@ kmem_cache_t *kmem_cache_create(char *name, size_t bufsize, size_t align, void *_private, struct vmem *vmp, int cflags); void kmem_cache_destroy(kmem_cache_t *cache); void *kmem_cache_alloc(kmem_cache_t *cache, int flags); -void kmem_cache_free(kmem_cache_t *cache, void *buf); +void kmem_cache_free(kmem_cache_t *cache, const void *buf); void kmem_cache_free_to_slab(kmem_cache_t *cache, void *buf); void kmem_cache_reap_now(kmem_cache_t *cache); void kmem_depot_ws_zero(kmem_cache_t *cache); diff --git a/include/os/windows/spl/sys/kmem_impl.h b/include/os/windows/spl/sys/kmem_impl.h index 50d32cc74cac..98593d3601ee 100644 --- a/include/os/windows/spl/sys/kmem_impl.h +++ b/include/os/windows/spl/sys/kmem_impl.h @@ -372,6 +372,7 @@ struct kmem_cache { uint64_t cache_bufmax; /* max buffers ever */ uint64_t cache_bufslab; /* buffers free in slab layer */ uint64_t cache_reap; /* cache reaps */ + kmutex_t cache_reap_lock; /* one reap at a time */ uint64_t cache_rescale; /* hash table rescales */ uint64_t cache_lookup_depth; /* hash lookup depth */ uint64_t cache_depot_contention; /* mutex contention count */ @@ -464,7 +465,7 @@ typedef struct kmem_log_header { kmutex_t lh_lock; char *lh_base; uint32_t *lh_free; - uint32_t lh_chunksize; + size_t lh_chunksize; uint32_t lh_nchunks; uint32_t lh_head; uint32_t lh_tail; diff --git a/include/os/windows/spl/sys/seg_kmem.h b/include/os/windows/spl/sys/seg_kmem.h index a4784b2fc0de..728ba1bb426c 100644 --- a/include/os/windows/spl/sys/seg_kmem.h +++ b/include/os/windows/spl/sys/seg_kmem.h @@ -42,6 +42,7 @@ extern "C" { extern uint64_t segkmem_total_allocated; extern 
vmem_t *abd_arena; +extern vmem_t *abd_subpage_arena; /* * segkmem page vnodes @@ -54,7 +55,7 @@ extern vmem_t *abd_arena; #endif /* __sparc */ void *segkmem_alloc(vmem_t *, size_t, int); -extern void segkmem_free(vmem_t *, void *, size_t); +extern void segkmem_free(vmem_t *, const void *, size_t); extern void kernelheap_init(void); extern void kernelheap_fini(void); extern void *segkmem_zio_alloc(vmem_t *, size_t, int); diff --git a/include/os/windows/spl/sys/vmem.h b/include/os/windows/spl/sys/vmem.h index e24d7c53c729..fb45c027a796 100644 --- a/include/os/windows/spl/sys/vmem.h +++ b/include/os/windows/spl/sys/vmem.h @@ -125,7 +125,7 @@ struct vmem; typedef struct vmem vmem_t; typedef void *(vmem_alloc_t)(vmem_t *, size_t, int); -typedef void (vmem_free_t)(vmem_t *, void *, size_t); +typedef void (vmem_free_t)(vmem_t *, const void *, size_t); /* * Alternate import style; the requested size is passed in a pointer, @@ -151,8 +151,8 @@ extern void vmem_destroy(vmem_t *); extern void *vmem_alloc_impl(vmem_t *, size_t, int); extern void *vmem_xalloc(vmem_t *, size_t, size_t, size_t, size_t, void *, void *, int); -extern void vmem_free_impl(vmem_t *, void *, size_t); -extern void vmem_xfree(vmem_t *, void *, size_t); +extern void vmem_free_impl(vmem_t *, const void *, size_t); +extern void vmem_xfree(vmem_t *, const void *, size_t); extern void *vmem_add(vmem_t *, void *, size_t, int); extern int vmem_contains(vmem_t *, void *, size_t); extern void vmem_walk(vmem_t *, int, void (*)(void *, void *, size_t), diff --git a/include/os/windows/spl/sys/vmem_impl.h b/include/os/windows/spl/sys/vmem_impl.h index fc52a0f3b890..45db4d47b9e5 100644 --- a/include/os/windows/spl/sys/vmem_impl.h +++ b/include/os/windows/spl/sys/vmem_impl.h @@ -114,6 +114,8 @@ typedef struct vmem_kstat { kstat_named_t vk_parent_free; /* called the source free function */ kstat_named_t vk_threads_waiting; /* threads in cv_wait in vmem */ kstat_named_t vk_excess; /* count of retained excess imports */ + kstat_named_t vk_lowest_stack; /* least remaining stack seen */ + kstat_named_t vk_async_stack_calls; /* times allocated off-thread */ } vmem_kstat_t; struct vmem { diff --git a/module/os/windows/spl/spl-kmem.c b/module/os/windows/spl/spl-kmem.c index eaf417d07a07..88dd75832633 100644 --- a/module/os/windows/spl/spl-kmem.c +++ b/module/os/windows/spl/spl-kmem.c @@ -24,7 +24,7 @@ * Copyright (C) 2008 MacZFS * Copyright (C) 2013, 2020 Jorgen Lundman * Copyright (C) 2014 Brendon Humphrey - * Copyright (C) 2017 Sean Doran + * Copyright (C) 2017, 2021, 2023 Sean Doran * Copyright 2015 Nexenta Systems, Inc. All rights reserved. 
* Portions Copyright 2022 Andrew Innes * @@ -71,8 +71,7 @@ const unsigned int spl_vm_page_free_min = 3500; static kcondvar_t spl_free_thread_cv; static kmutex_t spl_free_thread_lock; static boolean_t spl_free_thread_exit; -static volatile _Atomic int64_t spl_free; -int64_t spl_free_delta_ema; +static volatile _Atomic int64_t spl_free = 0; static boolean_t spl_event_thread_exit = FALSE; PKEVENT low_mem_event = NULL; @@ -82,11 +81,33 @@ static volatile _Atomic boolean_t spl_free_fast_pressure = FALSE; static _Atomic bool spl_free_maybe_reap_flag = false; static _Atomic uint64_t spl_free_last_pressure = 0; +uint64_t spl_enforce_memory_caps = 1; +_Atomic uint64_t spl_dynamic_memory_cap = 0; +hrtime_t spl_dynamic_memory_cap_last_downward_adjust = 0; +uint64_t spl_dynamic_memory_cap_skipped = 0; +kmutex_t spl_dynamic_memory_cap_lock; +uint64_t spl_dynamic_memory_cap_reductions = 0; +uint64_t spl_dynamic_memory_cap_hit_floor = 0; +static uint64_t spl_manual_memory_cap = 0; +static uint64_t spl_memory_cap_enforcements = 0; + +extern void spl_set_arc_no_grow(int); + +/* + * variables informed by "pure" mach_vm_pressure interface + * + * osfmk/vm/vm_pageout.c: "We don't need fully + * accurate monitoring anyway..." + * + * but in macOS_pure we do want modifications of these + * variables to be seen by all the other threads + * consistently, and asap (there may be hundreds + * of simultaneous readers, even if few writers!) + */ _Atomic uint32_t spl_vm_pages_reclaimed = 0; _Atomic uint32_t spl_vm_pages_wanted = 0; _Atomic uint32_t spl_vm_pressure_level = 0; - /* * the spl_pressure_level enum only goes to four, * but we want to watch kstat for whether @@ -116,7 +137,7 @@ void read_random(void *buffer, uint_t numbytes); // the kmem module is preparing to unload. 
static int shutting_down = 0; -// Amount of RAM in machine +// Amount of RAM PAGES in machine uint64_t physmem = 0; // Size in bytes of the memory allocated in seg_kmem @@ -431,8 +452,8 @@ for (_e = &_s[(count) - 1]; _e > _s; _e--) \ struct { hrtime_t kmp_timestamp; /* timestamp of panic */ int kmp_error; /* type of kmem error */ - void *kmp_buffer; /* buffer that induced panic */ - void *kmp_realbuf; /* real start address for buffer */ + const void *kmp_buffer; /* buffer that induced panic */ + const void *kmp_realbuf; /* real start address for buffer */ kmem_cache_t *kmp_cache; /* buffer's cache according to client */ kmem_cache_t *kmp_realcache; /* actual cache containing buffer */ kmem_slab_t *kmp_slab; /* slab accoring to kmem_findslab() */ @@ -440,9 +461,15 @@ struct { } kmem_panic_info; extern uint64_t stat_osif_malloc_success; +extern uint64_t stat_osif_malloc_fail; extern uint64_t stat_osif_malloc_bytes; extern uint64_t stat_osif_free; extern uint64_t stat_osif_free_bytes; +extern uint64_t stat_osif_malloc_sub128k; +extern uint64_t stat_osif_malloc_sub64k; +extern uint64_t stat_osif_malloc_sub32k; +extern uint64_t stat_osif_malloc_page; +extern uint64_t stat_osif_malloc_subpage; extern uint64_t spl_bucket_non_pow2_allocs; @@ -462,20 +489,14 @@ extern uint64_t spl_vmem_conditional_alloc_bytes; extern uint64_t spl_vmem_conditional_alloc_deny; extern uint64_t spl_vmem_conditional_alloc_deny_bytes; -extern uint64_t spl_xat_success; -extern uint64_t spl_xat_late_success; -extern uint64_t spl_xat_late_success_nosleep; extern uint64_t spl_xat_pressured; -extern uint64_t spl_xat_bailed; -extern uint64_t spl_xat_bailed_contended; extern uint64_t spl_xat_lastalloc; extern uint64_t spl_xat_lastfree; -extern uint64_t spl_xat_forced; extern uint64_t spl_xat_sleep; -extern uint64_t spl_xat_late_deny; -extern uint64_t spl_xat_no_waiters; -extern uint64_t spl_xft_wait; +extern uint64_t spl_vba_fastpath; +extern uint64_t spl_vba_fastexit; +extern uint64_t spl_vba_slowpath; extern uint64_t spl_vba_parent_memory_appeared; extern uint64_t spl_vba_parent_memory_blocked; extern uint64_t spl_vba_hiprio_blocked; @@ -507,6 +528,7 @@ uint64_t kmem_free_to_slab_when_fragmented = 0; extern _Atomic uint64_t spl_lowest_vdev_disk_stack_remaining; extern _Atomic uint64_t spl_lowest_zvol_stack_remaining; extern _Atomic uint64_t spl_lowest_alloc_stack_remaining; +extern unsigned int spl_split_stack_below; typedef struct spl_stats { kstat_named_t spl_os_alloc; @@ -518,12 +540,27 @@ typedef struct spl_stats { kstat_named_t spl_spl_free; kstat_named_t spl_spl_free_manual_pressure; kstat_named_t spl_spl_free_fast_pressure; - kstat_named_t spl_spl_free_delta_ema; kstat_named_t spl_spl_free_negative_count; kstat_named_t spl_osif_malloc_success; + kstat_named_t spl_osif_malloc_fail; kstat_named_t spl_osif_malloc_bytes; kstat_named_t spl_osif_free; kstat_named_t spl_osif_free_bytes; + + kstat_named_t spl_enforce_memory_caps; + kstat_named_t spl_dynamic_memory_cap; + kstat_named_t spl_dynamic_memory_cap_skipped; + kstat_named_t spl_dynamic_memory_cap_reductions; + kstat_named_t spl_dynamic_memory_cap_hit_floor; + kstat_named_t spl_manual_memory_cap; + kstat_named_t spl_memory_cap_enforcements; + + kstat_named_t spl_osif_malloc_sub128k; + kstat_named_t spl_osif_malloc_sub64k; + kstat_named_t spl_osif_malloc_sub32k; + kstat_named_t spl_osif_malloc_page; + kstat_named_t spl_osif_malloc_subpage; + kstat_named_t spl_bucket_non_pow2_allocs; kstat_named_t spl_vmem_unconditional_allocs; @@ -533,20 +570,15 @@ typedef struct 
spl_stats { kstat_named_t spl_vmem_conditional_alloc_deny; kstat_named_t spl_vmem_conditional_alloc_deny_bytes; - kstat_named_t spl_xat_success; - kstat_named_t spl_xat_late_success; - kstat_named_t spl_xat_late_success_nosleep; kstat_named_t spl_xat_pressured; kstat_named_t spl_xat_bailed; - kstat_named_t spl_xat_bailed_contended; kstat_named_t spl_xat_lastalloc; kstat_named_t spl_xat_lastfree; - kstat_named_t spl_xat_forced; kstat_named_t spl_xat_sleep; - kstat_named_t spl_xat_late_deny; - kstat_named_t spl_xat_no_waiters; - kstat_named_t spl_xft_wait; + kstat_named_t spl_vba_fastpath; + kstat_named_t spl_vba_fastexit; + kstat_named_t spl_vba_slowpath; kstat_named_t spl_vba_parent_memory_appeared; kstat_named_t spl_vba_parent_memory_blocked; kstat_named_t spl_vba_hiprio_blocked; @@ -573,10 +605,10 @@ typedef struct spl_stats { kstat_named_t spl_vm_pages_reclaimed; kstat_named_t spl_vm_pages_wanted; kstat_named_t spl_vm_pressure_level; - kstat_named_t spl_lowest_alloc_stack_remaining; kstat_named_t spl_lowest_vdev_disk_stack_remaining; kstat_named_t spl_lowest_zvol_stack_remaining; + kstat_named_t spl_split_stack_below; } spl_stats_t; static spl_stats_t spl_stats = { @@ -589,12 +621,27 @@ static spl_stats_t spl_stats = { {"spl_spl_free", KSTAT_DATA_INT64}, {"spl_spl_free_manual_pressure", KSTAT_DATA_UINT64}, {"spl_spl_free_fast_pressure", KSTAT_DATA_UINT64}, - {"spl_spl_free_delta_ema", KSTAT_DATA_UINT64}, {"spl_spl_free_negative_count", KSTAT_DATA_UINT64}, {"spl_osif_malloc_success", KSTAT_DATA_UINT64}, + {"spl_osif_malloc_fail", KSTAT_DATA_UINT64}, {"spl_osif_malloc_bytes", KSTAT_DATA_UINT64}, {"spl_osif_free", KSTAT_DATA_UINT64}, {"spl_osif_free_bytes", KSTAT_DATA_UINT64}, + + {"spl_osif_enforce_memory_caps", KSTAT_DATA_UINT64}, + {"spl_osif_dynamic_memory_cap", KSTAT_DATA_UINT64}, + {"spl_osif_dynamic_memory_cap_skipped", KSTAT_DATA_UINT64}, + {"spl_osif_dynamic_memory_cap_reductions", KSTAT_DATA_UINT64}, + {"spl_osif_dynamic_memory_cap_hit_floor", KSTAT_DATA_UINT64}, + {"spl_osif_manual_memory_cap", KSTAT_DATA_UINT64}, + {"spl_osif_memory_cap_enforcements", KSTAT_DATA_UINT64}, + + {"spl_osif_malloc_sub128k", KSTAT_DATA_UINT64}, + {"spl_osif_malloc_sub64k", KSTAT_DATA_UINT64}, + {"spl_osif_malloc_sub32k", KSTAT_DATA_UINT64}, + {"spl_osif_malloc_page", KSTAT_DATA_UINT64}, + {"spl_osif_malloc_subpage", KSTAT_DATA_UINT64}, + {"spl_bucket_non_pow2_allocs", KSTAT_DATA_UINT64}, {"vmem_unconditional_allocs", KSTAT_DATA_UINT64}, @@ -604,20 +651,14 @@ static spl_stats_t spl_stats = { {"vmem_conditional_alloc_deny", KSTAT_DATA_UINT64}, {"vmem_conditional_alloc_deny_bytes", KSTAT_DATA_UINT64}, - {"spl_xat_success", KSTAT_DATA_UINT64}, - {"spl_xat_late_success", KSTAT_DATA_UINT64}, - {"spl_xat_late_success_nosleep", KSTAT_DATA_UINT64}, {"spl_xat_pressured", KSTAT_DATA_UINT64}, - {"spl_xat_bailed", KSTAT_DATA_UINT64}, - {"spl_xat_bailed_contended", KSTAT_DATA_UINT64}, {"spl_xat_lastalloc", KSTAT_DATA_UINT64}, {"spl_xat_lastfree", KSTAT_DATA_UINT64}, - {"spl_xat_forced", KSTAT_DATA_UINT64}, {"spl_xat_sleep", KSTAT_DATA_UINT64}, - {"spl_xat_late_deny", KSTAT_DATA_UINT64}, - {"spl_xat_no_waiters", KSTAT_DATA_UINT64}, - {"spl_xft_wait", KSTAT_DATA_UINT64}, + {"spl_vba_fastpath", KSTAT_DATA_UINT64}, + {"spl_vba_fastexit", KSTAT_DATA_UINT64}, + {"spl_vba_slowpath", KSTAT_DATA_UINT64}, {"spl_vba_parent_memory_appeared", KSTAT_DATA_UINT64}, {"spl_vba_parent_memory_blocked", KSTAT_DATA_UINT64}, {"spl_vba_hiprio_blocked", KSTAT_DATA_UINT64}, @@ -644,11 +685,10 @@ static spl_stats_t spl_stats = { 
{"spl_vm_pages_reclaimed", KSTAT_DATA_UINT64}, {"spl_vm_pages_wanted", KSTAT_DATA_UINT64}, {"spl_vm_pressure_level", KSTAT_DATA_UINT64}, - {"lowest_alloc_stack_remaining", KSTAT_DATA_UINT64}, {"lowest_vdev_disk_stack_remaining", KSTAT_DATA_UINT64}, {"lowest_zvol_stack_remaining", KSTAT_DATA_UINT64}, - + {"split_stack_below", KSTAT_DATA_UINT64}, }; static kstat_t *spl_ksp = 0; @@ -721,11 +761,11 @@ copy_pattern(uint64_t pattern, void *buf_arg, size_t size) *buf++ = pattern; } -static void * -verify_pattern(uint64_t pattern, void *buf_arg, size_t size) +static const void * +verify_pattern(uint64_t pattern, const void *buf_arg, size_t size) { - uint64_t *bufend = (uint64_t *)((char *)buf_arg + size); - uint64_t *buf; + const uint64_t *bufend = (const uint64_t *)((char *)buf_arg + size); + const uint64_t *buf; for (buf = buf_arg; buf < bufend; buf++) if (*buf != pattern) @@ -790,7 +830,7 @@ kmem_cache_applyall_id(void (*func)(kmem_cache_t *), taskq_t *tq, int tqflag) * Debugging support. Given a buffer address, find its slab. */ static kmem_slab_t * -kmem_findslab(kmem_cache_t *cp, void *buf) +kmem_findslab(kmem_cache_t *cp, const void *buf) { kmem_slab_t *sp; @@ -815,14 +855,14 @@ kmem_findslab(kmem_cache_t *cp, void *buf) } static void -kmem_error(int error, kmem_cache_t *cparg, void *bufarg) +kmem_error(int error, kmem_cache_t *cparg, const void *bufarg) { kmem_buftag_t *btp = NULL; kmem_bufctl_t *bcp = NULL; kmem_cache_t *cp = cparg; kmem_slab_t *sp; - uint64_t *off; - void *buf = bufarg; + const uint64_t *off; + const void *buf = bufarg; kmem_logging = 0; /* stop logging when a bad thing happens */ @@ -883,10 +923,15 @@ kmem_error(int error, kmem_cache_t *cparg, void *bufarg) case KMERR_MODIFIED: TraceEvent(TRACE_ERROR, "buffer modified after being" " freed\n"); + dprintf("buffer modified after being freed\n"); off = verify_pattern(KMEM_FREE_PATTERN, buf, cp->cache_verify); if (off == NULL) /* shouldn't happen */ off = buf; + dprintf("SPL: modification occurred at offset 0x%lx " + "(0x%llx replaced by 0x%llx)\n", + (uintptr_t)off - (uintptr_t)buf, + (longlong_t)KMEM_FREE_PATTERN, (longlong_t)*off); TraceEvent(TRACE_ERROR, "SPL: modification occurred " "at offset 0x%lx (0x%llx replaced by 0x%llx)\n", (uintptr_t)off - (uintptr_t)buf, @@ -894,21 +939,28 @@ kmem_error(int error, kmem_cache_t *cparg, void *bufarg) break; case KMERR_REDZONE: + dprintf("redzone violation: write past end of buf\n"); TraceEvent(TRACE_ERROR, "redzone violation: write past" " end of buffer\n"); break; case KMERR_BADADDR: + dprintf("invalid free: buffer not in cache\n"); TraceEvent(TRACE_ERROR, "invalid free: buffer not in" " cache\n"); break; case KMERR_DUPFREE: + dprintf("duplicate free: buffer freed twice\n"); TraceEvent(TRACE_ERROR, "duplicate free: buffer freed" " twice\n"); break; case KMERR_BADBUFTAG: + dprintf("boundary tag corrupted\n"); + dprintf("SPL: bcp ^ bxstat = %lx, should be %lx\n", + (intptr_t)btp->bt_bufctl ^ btp->bt_bxstat, + KMEM_BUFTAG_FREE); TraceEvent(TRACE_ERROR, "boundary tag corrupted\n"); TraceEvent(TRACE_ERROR, "SPL: bcp ^ bxstat = %lx, " "should be %lx\n", @@ -917,10 +969,16 @@ kmem_error(int error, kmem_cache_t *cparg, void *bufarg) break; case KMERR_BADBUFCTL: + dprintf("bufctl corrupted\n"); TraceEvent(TRACE_ERROR, "bufctl corrupted\n"); break; case KMERR_BADCACHE: + dprintf("buffer freed to wrong cache\n"); + dprintf("SPL: buffer was allocated from %s,\n", + cp->cache_name); + dprintf("SPL: caller attempting free to %s.\n", + cparg->cache_name); TraceEvent(TRACE_ERROR, "buffer freed 
to wrong " "cache\n"); TraceEvent(TRACE_ERROR, "SPL: buffer was allocated" @@ -930,6 +988,9 @@ kmem_error(int error, kmem_cache_t *cparg, void *bufarg) break; case KMERR_BADSIZE: + dprintf("bad free: free size (%u) != alloc size (%u)\n", + KMEM_SIZE_DECODE(((uint32_t *)btp)[0]), + KMEM_SIZE_DECODE(((uint32_t *)btp)[1])); TraceEvent(TRACE_ERROR, "bad free: free size (%u) !=" " alloc size (%u)\n", KMEM_SIZE_DECODE(((uint32_t *)btp)[0]), @@ -937,6 +998,8 @@ kmem_error(int error, kmem_cache_t *cparg, void *bufarg) break; case KMERR_BADBASE: + dprintf("bad free: free address (%p) != alloc address" + " (%p)\n", bufarg, buf); TraceEvent(TRACE_ERROR, "bad free: free address" " (%p) != alloc address (%p)\n", bufarg, buf); break; @@ -1376,7 +1439,7 @@ static void kmem_slab_move_yes(kmem_cache_t *, kmem_slab_t *, void *); static void kmem_slab_free(kmem_cache_t *cp, void *buf) { - kmem_slab_t *sp; + kmem_slab_t *sp = NULL; kmem_bufctl_t *bcp, **prev_bcpp; ASSERT(buf != NULL); @@ -1754,12 +1817,24 @@ kmem_depot_ws_zero(kmem_cache_t *cp) } /* - * The number of bytes to reap before we call kpreempt(). The default (1MB) - * causes us to preempt reaping up to hundres of times per second. Using a - * larger value (1GB) causes this to have virtually no effect. + * The number of bytes to reap before we call kpreempt(). + * + * There is a tradeoff between potentially many many preempts when giving + * freeing a large amount of ARC scatter ABDs (the preempts slightly slow down + * the return of memory to parent arenas during a larger reap, which in turn + * slightly delays the return of memory to the operating system) versus + * letting other threads on low-core-count machines make forward progress + * (which was upstream's goal when reap preemption was first introduced) or + * (in more modern times) gaining efficiencies in busy high-core-count + * machines that can have many threads allocating while an inevitably + * long-lived reap is in progress, narrowing the possibility of destroying + * kmem structures that might have to be rebuilt during the next preemption. + * + * Historically 1M was the value from upstream, which was increased for o3x + * for performance reasons. The reap mechanisms have evolved such that 1M + * is once again the better default. */ -size_t kmem_reap_preempt_bytes = 64 * 1024 * 1024; - +size_t kmem_reap_preempt_bytes = 1024 * 1024; /* * Reap all magazines that have fallen out of the depot's working set. 
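The preempt-threshold behaviour described in the comment above is easiest to see in the reap loop itself. The following is a minimal sketch of the pattern used by kmem_depot_ws_reap() (helper names such as kmem_depot_alloc(), kmem_magazine_destroy() and the cache fields are taken from the surrounding kmem code; treat the exact loop shape as illustrative rather than a verbatim copy of the patched function):

    /* reap depot magazines, yielding every kmem_reap_preempt_bytes bytes */
    size_t bytes = 0;
    kmem_magazine_t *mp;
    long reap = MIN(cp->cache_full.ml_reaplimit, cp->cache_full.ml_min);

    while (reap-- && (mp = kmem_depot_alloc(cp, &cp->cache_full)) != NULL) {
        kmem_magazine_destroy(cp, mp, cp->cache_magtype->mt_magsize);
        bytes += cp->cache_magtype->mt_magsize * cp->cache_bufsize;
        if (bytes > kmem_reap_preempt_bytes) {
            /* give other runnable threads a chance before continuing */
            kpreempt(KPREEMPT_SYNC);
            bytes = 0;
        }
    }

With the default restored to 1 MiB, a large reap yields frequently; raising the tunable back toward the old 64 MiB value trades that latency for fewer preemptions during long reaps.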
@@ -1774,6 +1849,22 @@ kmem_depot_ws_reap(kmem_cache_t *cp) ASSERT(!list_link_active(&cp->cache_link) || taskq_member(kmem_taskq, curthread)); + bool mtx_contended = false; + + if (!mutex_tryenter(&cp->cache_reap_lock)) { + mtx_contended = true; + dprintf("ZFS: SPL: %s:%s:%d: could not get lock\n", + __FILE__, __func__, __LINE__); + IOSleep(1); + mutex_enter(&cp->cache_reap_lock); + } + + if (mtx_contended) + dprintf("ZFS: SPL: %s:%s:%d: reap mutex for %s " + "was contended\n", + __FILE__, __func__, __LINE__, + cp->cache_name); + reap = MIN(cp->cache_full.ml_reaplimit, cp->cache_full.ml_min); while (reap-- && (mp = kmem_depot_alloc(cp, &cp->cache_full)) != NULL) { @@ -1795,6 +1886,8 @@ kmem_depot_ws_reap(kmem_cache_t *cp) bytes = 0; } } + + mutex_exit(&cp->cache_reap_lock); } static void @@ -1807,7 +1900,7 @@ kmem_cpu_reload(kmem_cpu_cache_t *ccp, kmem_magazine_t *mp, int rounds) ccp->cc_ploaded = ccp->cc_loaded; ccp->cc_prounds = ccp->cc_rounds; ccp->cc_loaded = mp; - ccp->cc_rounds = rounds; + ccp->cc_rounds = (short)rounds; } /* @@ -1951,7 +2044,6 @@ kmem_dump_finish(char *buf, size_t size) int kdi_end = kmem_dump_log_idx; int percent = 0; int header = 0; - int warn = 0; size_t used; kmem_cache_t *cp; kmem_dump_log_t *kdl; @@ -1969,7 +2061,7 @@ kmem_dump_finish(char *buf, size_t size) kmem_dumppr(&p, e, "heap size,%ld\n", kmem_dump_size); kmem_dumppr(&p, e, "Oversize allocs,%d\n", kmem_dump_oversize_allocs); - kmem_dumppr(&p, e, "Oversize max size,%ld\n", + kmem_dumppr(&p, e, "Oversize max size,%u\n", kmem_dump_oversize_max); for (kdi_idx = 0; kdi_idx < kdi_end; kdi_idx++) { @@ -1977,8 +2069,6 @@ kmem_dump_finish(char *buf, size_t size) cp = kdl->kdl_cache; if (cp == NULL) break; - if (kdl->kdl_alloc_fails) - ++warn; if (header == 0) { kmem_dumppr(&p, e, "Cache Name,Allocs,Frees,Alloc Fails," @@ -2354,7 +2444,7 @@ kmem_cache_parent_arena_fragmented(kmem_cache_t *cp) * Free a constructed object to cache cp. */ void -kmem_cache_free(kmem_cache_t *cp, void *buf) +kmem_cache_free(kmem_cache_t *cp, const void *buf) { kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp); @@ -2372,7 +2462,8 @@ kmem_cache_free(kmem_cache_t *cp, void *buf) ASSERT(!(ccp->cc_flags & KMF_DUMPDIVERT)); /* log it so that we can warn about it */ KDI_LOG(cp, kdl_unsafe); - } else if (KMEM_DUMPCC(ccp) && !kmem_cache_free_dump(cp, buf)) { + } else if (KMEM_DUMPCC(ccp) && !kmem_cache_free_dump(cp, + __DECONST(void *, buf))) { return; } if (ccp->cc_flags & KMF_BUFTAG) { @@ -2391,7 +2482,8 @@ kmem_cache_free(kmem_cache_t *cp, void *buf) * loaded magazine, just put the object there and return. 
*/ if ((uint_t)ccp->cc_rounds < ccp->cc_magsize) { - ccp->cc_loaded->mag_round[ccp->cc_rounds++] = buf; + ccp->cc_loaded->mag_round[ccp->cc_rounds++] = + __DECONST(void *, buf); ccp->cc_free++; mutex_exit(&ccp->cc_lock); return; @@ -2439,7 +2531,7 @@ kmem_cache_free(kmem_cache_t *cp, void *buf) } mutex_exit(&ccp->cc_lock); kpreempt(KPREEMPT_SYNC); - kmem_slab_free_constructed(cp, buf, B_TRUE); + kmem_slab_free_constructed(cp, __DECONST(void *, buf), B_TRUE); } /* @@ -2652,7 +2744,7 @@ zfs_kmem_alloc(size_t size, int kmflag) } void -zfs_kmem_free(void *buf, size_t size) +zfs_kmem_free(const void *buf, size_t size) { size_t index; kmem_cache_t *cp; @@ -2774,7 +2866,8 @@ kmem_reap_timeout(void *flag_arg) ASSERT(flag == (uint32_t *)&kmem_reaping || flag == (uint32_t *)&kmem_reaping_idspace); - *flag = 0; + __atomic_store_n(flag, 0, __ATOMIC_RELEASE); + ASSERT3U(*flag, ==, 0); } static void @@ -2813,19 +2906,27 @@ kmem_reap_common(void *flag_arg) { uint32_t *flag = (uint32_t *)flag_arg; + ASSERT(flag == &kmem_reaping || flag == &kmem_reaping_idspace); + /* If conditions are met, try to set flag to 1 */ if (MUTEX_HELD(&kmem_cache_lock) || kmem_taskq == NULL || atomic_cas_32(flag, 0, 1) != 0) return; + /* + * If we are here, the appropriate flag is 1. It will be atomically + * zeroed after the reaping has finished and the timeout has expired. + */ /* - * It may not be kosher to do memory allocation when a reap is called + * It may not be safe to do memory allocation when a reap is called * is called (for example, if vmem_populate() is in the call chain). * So we start the reap going with a TQ_NOALLOC dispatch. If the * dispatch fails, we reset the flag, and the next reap will try again. */ - if (!taskq_dispatch(kmem_taskq, kmem_reap_start, flag, TQ_NOALLOC)) - *flag = 0; + if (!taskq_dispatch(kmem_taskq, kmem_reap_start, flag, TQ_NOALLOC)) { + __atomic_store_n(flag, 0, __ATOMIC_RELEASE); + ASSERT3U(*flag, ==, 0); + } } /* @@ -2939,21 +3040,36 @@ kmem_cache_magazine_disable(kmem_cache_t *cp) boolean_t kmem_cache_reap_active(void) { - return (B_FALSE); + return (kmem_reaping.flag); } /* - * Reap (almost) everything right now. + * Fire off a kmem_reap(); that will put a kmem_reap_start() into the taskq if + * conditions are favourable. + * + * This function can be frequently called by common code. Arguably it is + * over-called. + * + * Previously, a kmem_depot_ws_zero(cp) would erase the working set + * information of the kmem cache; it is probably better to let other events + * evolve the magazine working set. + * + * Also previously, a kmem_depot_ws_reap(cp) was dispatched on the kmem taskq. + * This appears to have some unsafeness with respect to concurrency, and this + * unconditional start-a-reap-right-now approach was abandoned by the other + * openzfs ports. On macOS there does not seem to be an advantage in stepping + * around the kmem_reap{,common,start,timeout}() concurrency-controlling + * mechanism (atomic compare-and-swap on kmem_reaping, with an atomic set to + * zero after a delay once the reaping task is done). Moreover, skipping the + * kmem_reaping flag check may have led to double-frees of destroyed depots to + * qcache-equipped vmem arenas. 
*/ void -kmem_cache_reap_now(kmem_cache_t *cp) +kmem_cache_reap_now(kmem_cache_t *cp __maybe_unused) { ASSERT(list_link_active(&cp->cache_link)); - kmem_depot_ws_zero(cp); - - (void) taskq_dispatch(kmem_taskq, - (task_func_t *)kmem_depot_ws_reap, cp, TQ_SLEEP); + kmem_reap(); } /* @@ -3285,7 +3401,7 @@ kmem_cache_stat(kmem_cache_t *cp, char *name) // TRUE if we have more than a critical minimum of memory // used in arc_memory_throttle; if FALSE, we throttle -bool +static bool spl_minimal_physmem_p_logic() { // do we have enough memory to avoid throttling? @@ -3318,13 +3434,7 @@ spl_minimal_physmem_p(void) size_t kmem_maxavail(void) { -#ifndef APPLE - // spgcnt_t pmem = availrmem - tune.t_minarmem; - // spgcnt_t vmem = btop(vmem_size(heap_arena, VMEM_FREE)); - // - // return ((size_t)ptob(MAX(MIN(pmem, vmem), 0))); -#endif - return (physmem * PAGE_SIZE); + return (total_memory); } /* @@ -3624,7 +3734,7 @@ kmem_cache_create( ASSERT(chunksize + sizeof (kmem_slab_t) <= cp->cache_slabsize); ASSERT(!(cp->cache_flags & KMF_AUDIT)); } else { - size_t chunks, bestfit, waste, slabsize; + size_t chunks, bestfit = 0, waste, slabsize; size_t minwaste = LONG_MAX; for (chunks = 1; chunks <= KMEM_VOID_FRACTION; chunks++) { @@ -3670,6 +3780,8 @@ kmem_cache_create( cp->cache_color = cp->cache_mincolor; + mutex_init(&cp->cache_reap_lock, NULL, MUTEX_DEFAULT, NULL); + /* * Initialize the rest of the slab layer. */ @@ -3892,6 +4004,13 @@ kmem_cache_destroy(kmem_cache_t *cp) kmem_cache_magazine_purge(cp); + /* + * make sure there isn't a reaper + * since it would dereference cp + */ + mutex_enter(&cp->cache_reap_lock); + mutex_exit(&cp->cache_reap_lock); + mutex_enter(&cp->cache_lock); if (cp->cache_buftotal != 0) @@ -3934,6 +4053,7 @@ kmem_cache_destroy(kmem_cache_t *cp) mutex_destroy(&cp->cache_depot_lock); mutex_destroy(&cp->cache_lock); + mutex_destroy(&cp->cache_reap_lock); vmem_free_impl(kmem_cache_arena, cp, KMEM_CACHE_SIZE(max_ncpus)); } @@ -4139,7 +4259,15 @@ kmem_cache_init(int pass, int use_large_pages) kmem_big_alloc_table_max = maxbuf >> KMEM_BIG_SHIFT; } +/* + * At kext unload, kmem_cache_build_slablist() builds a list of free slabs + * from all kmem caches, so kmem_cache_fini() can report the leaks and the + * total number of leaks. 
+ */ + struct free_slab { + char vm_name[VMEM_NAMELEN]; + char cache_name[KMEM_CACHE_NAMELEN + 1]; vmem_t *vmp; size_t slabsize; void *slab; @@ -4148,7 +4276,6 @@ struct free_slab { static list_t freelist; - void kmem_cache_build_slablist(kmem_cache_t *cp) { @@ -4163,6 +4290,9 @@ kmem_cache_build_slablist(kmem_cache_t *cp) MALLOC(fs, struct free_slab *, sizeof (struct free_slab), M_TEMP, M_WAITOK); + strlcpy(fs->vm_name, vmp->vm_name, VMEM_NAMELEN); + strlcpy(fs->cache_name, cp->cache_name, + KMEM_CACHE_NAMELEN); fs->vmp = vmp; fs->slabsize = cp->cache_slabsize; fs->slab = (void *)P2ALIGN((uintptr_t)sp->slab_base, @@ -4176,6 +4306,9 @@ kmem_cache_build_slablist(kmem_cache_t *cp) MALLOC(fs, struct free_slab *, sizeof (struct free_slab), M_TEMP, M_WAITOK); + strlcpy(fs->vm_name, vmp->vm_name, VMEM_NAMELEN); + strlcpy(fs->cache_name, cp->cache_name, + KMEM_CACHE_NAMELEN); fs->vmp = vmp; fs->slabsize = cp->cache_slabsize; fs->slab = (void *)P2ALIGN((uintptr_t)sp->slab_base, @@ -4225,19 +4358,132 @@ kmem_cache_fini() i = 0; while ((fs = list_head(&freelist))) { i++; + dprintf("SPL: %s:%d: released %lu from '%s' to '%s'\n", + __func__, __LINE__, + fs->slabsize, + fs->cache_name, + fs->vm_name); list_remove(&freelist, fs); vmem_free_impl(fs->vmp, fs->slab, fs->slabsize); FREE(fs, M_TEMP); } - xprintf("SPL: Released %u slabs\n", i); + dprintf("SPL: %s:%d: Released %u slabs TOTAL\n", + __func__, __LINE__, i); + list_destroy(&freelist); } -// this is intended to substitute for kmem_avail() in arc.c +/* + * Reduce dynamic memory cap by a set amount ("reduction"), unless the cap is + * already 1/8 of total_memory or lower. unlike the logic in + * spl-vmem.c:xnu_alloc_throttled(), we likely have not observed xnu being + * ready to deny us memory, so we drop half the cap half as much. + * + * Inter-thread synchronization of spl_dynamic_memory_cap and spl_free here in + * the next two functions is important as there _will_ be multi-core bursts + * of spl_free_wrapper() calls. + */ +int64_t +spl_reduce_dynamic_cap(void) +{ + /* + * take a snapshot of spl_dynamic_memory_cap, which + * may drop while we are in this function + */ + const uint64_t cap_in = spl_dynamic_memory_cap; + + const uint64_t reduce_amount = total_memory >> 8; + + const int64_t thresh = total_memory >> 3; + + const int64_t reduction = (int64_t)(cap_in - reduce_amount); + + const int64_t reduced = MAX(reduction, thresh); + + /* + * Adjust cap downwards if enough time has elapsed + * for previous adjustments to shrink memory use. + * + * We will still tell ARC to shrink by thresh. 
+ */ + mutex_enter(&spl_dynamic_memory_cap_lock); + + const hrtime_t now = gethrtime(); + if (now > spl_dynamic_memory_cap_last_downward_adjust + + SEC2NSEC(60)) { + + if (spl_dynamic_memory_cap == 0 || + spl_dynamic_memory_cap > total_memory) { + spl_dynamic_memory_cap_last_downward_adjust = now; + spl_dynamic_memory_cap = total_memory - reduce_amount; + atomic_inc_64(&spl_dynamic_memory_cap_reductions); + } else if (spl_dynamic_memory_cap > reduced) { + spl_dynamic_memory_cap_last_downward_adjust = now; + spl_dynamic_memory_cap = reduced; + atomic_inc_64(&spl_dynamic_memory_cap_reductions); + } else if (spl_dynamic_memory_cap <= thresh) { + spl_dynamic_memory_cap_last_downward_adjust = now; + spl_dynamic_memory_cap = thresh; + atomic_inc_64(&spl_dynamic_memory_cap_hit_floor); + } else { + atomic_inc_64(&spl_dynamic_memory_cap_skipped); + } + } else { + atomic_inc_64(&spl_dynamic_memory_cap_skipped); + } + + mutex_exit(&spl_dynamic_memory_cap_lock); + + const uint64_t cap_out = spl_dynamic_memory_cap; + const int64_t cap_diff = cap_out - cap_in; + const int64_t minusthresh = -(int64_t)thresh; + + if (cap_diff > minusthresh) { + spl_free = minusthresh; + return (minusthresh); + } else { + spl_free = cap_diff; + return (cap_diff); + } +} + +/* + * This substitutes for kmem_avail() in arc_os.c + * + * If we believe there is free memory but memory caps are active, enforce on + * them, decrementing the dynamic cap if necessary, returning a non-positive + * free memory to ARC if we have reached either enforced cap. + */ int64_t spl_free_wrapper(void) { +// MEMORYSTATUSEX memInfo; +// memInfo.dwLength = sizeof(MEMORYSTATUSEX); + + if (spl_enforce_memory_caps != 0 && spl_free > 0) { + if (segkmem_total_mem_allocated >= + spl_dynamic_memory_cap) { + atomic_inc_64(&spl_memory_cap_enforcements); + spl_set_arc_no_grow(B_TRUE); + return (spl_reduce_dynamic_cap()); + } else if (spl_manual_memory_cap > 0 && + segkmem_total_mem_allocated >= spl_manual_memory_cap) { + spl_set_arc_no_grow(B_TRUE); + atomic_inc_64(&spl_memory_cap_enforcements); + const int64_t dec = spl_manual_memory_cap - + segkmem_total_mem_allocated; + const int64_t giveback = -(total_memory >> 10); + if (dec > giveback) { + spl_free = giveback; + return (giveback); + } else { + spl_free = dec; + return (dec); + } + } + } + return (spl_free); } @@ -4296,7 +4542,7 @@ spl_free_set_and_wait_pressure(int64_t new_p, boolean_t fast, TraceEvent(TRACE_ERROR, "%s: ERROR: timed out " "after one minute!\n", __func__); break; - } else if (now > double_again_at && !doubled_again) { + } else if (doubled && now > double_again_at && !doubled_again) { doubled_again = true; new_p *= 2; } else if (now > double_at) { @@ -4317,7 +4563,9 @@ spl_free_set_pressure(int64_t new_p) spl_free_fast_pressure = FALSE; // wake up both spl_free_thread() to recalculate spl_free // and any spl_free_set_and_wait_pressure() threads - cv_broadcast(&spl_free_thread_cv); + mutex_enter(&spl_free_thread_lock); + cv_signal(&spl_free_thread_cv); + mutex_exit(&spl_free_thread_lock); } spl_free_last_pressure = zfs_lbolt(); } @@ -4430,11 +4678,6 @@ static void spl_free_thread() { callb_cpr_t cpr; - uint64_t last_update = zfs_lbolt(); - int64_t last_spl_free; - double ema_new = 0; - double ema_old = 0; - double alpha; CALLB_CPR_INIT(&cpr, &spl_free_thread_lock, callb_generic_cpr, FTAG); @@ -4442,6 +4685,9 @@ spl_free_thread() spl_free = MAX(4*1024*1024*1024, total_memory * 75ULL / 100ULL); + if (spl_dynamic_memory_cap == 0) + spl_dynamic_memory_cap = total_memory; + 
mutex_enter(&spl_free_thread_lock); dprintf("SPL: beginning spl_free_thread() loop, spl_free == %lld\n", @@ -4454,7 +4700,6 @@ spl_free_thread() mutex_exit(&spl_free_thread_lock); boolean_t lowmem = false; boolean_t emergency_lowmem = false; - int64_t base; int64_t new_spl_free = 0LL; spl_stats.spl_free_wake_count.value.ui64++; @@ -4469,8 +4714,6 @@ spl_free_thread() if (time_now > hz) time_now_seconds = time_now / hz; - last_spl_free = spl_free; - new_spl_free = total_memory - segkmem_total_mem_allocated; @@ -4489,7 +4732,11 @@ spl_free_thread() // uint32_t pages_reclaimed = 0; // uint32_t pages_wanted = 0; -/* get pressure here */ + // XNU calls mach_vm_pressure_monitor() which + // fills in pages_reclaimed and pages_wanted. + // then assign them to spl_vm_pages_reclaimed and + // spl_vm_pages_wanted + // Windows event thread will set them for us. if (spl_vm_pressure_level > 0 && spl_vm_pressure_level != MAGIC_PRESSURE_UNAVAILABLE) { @@ -4555,6 +4802,34 @@ spl_free_thread() } } + /* + * Pressure and declare zero free memory if we are above + * memory caps. This is not the hardest enforcement + * mechanism, so see also enforcement in spl_free_wrapper() + */ + if (spl_enforce_memory_caps) { + if (segkmem_total_mem_allocated >= + spl_dynamic_memory_cap) { + lowmem = true; + emergency_lowmem = true; + if (new_spl_free >= 0) + new_spl_free = + spl_dynamic_memory_cap - + segkmem_total_mem_allocated; + atomic_inc_64(&spl_memory_cap_enforcements); + } else if (spl_manual_memory_cap > 0 && + segkmem_total_mem_allocated >= + spl_manual_memory_cap) { + lowmem = true; + emergency_lowmem = true; + if (new_spl_free >= 0) + new_spl_free = + spl_manual_memory_cap - + segkmem_total_mem_allocated; + atomic_inc_64(&spl_memory_cap_enforcements); + } + } + /* * can we allocate at least a 64 MiB segment * from spl_heap_arena? 
this probes the reserve @@ -4742,8 +5017,6 @@ spl_free_thread() recent_lowmem = 0; } - base = new_spl_free; - // adjust for available memory in spl_heap_arena // cf arc_available_memory() if (!emergency_lowmem) { @@ -4797,8 +5070,6 @@ spl_free_thread() new_spl_free = -1024LL; } - double delta = (double)new_spl_free - (double)last_spl_free; - boolean_t spl_free_is_negative = false; if (new_spl_free < 0LL) { @@ -4817,6 +5088,20 @@ spl_free_thread() new_spl_free = 2LL * spamaxblksz; } + if (spl_enforce_memory_caps != 0) { + if (spl_dynamic_memory_cap != 0) { + const int64_t m = spl_dynamic_memory_cap - + segkmem_total_mem_allocated; + if (new_spl_free > m) + new_spl_free = m; + } else if (spl_manual_memory_cap != 0) { + const int64_t m = spl_manual_memory_cap - + segkmem_total_mem_allocated; + if (new_spl_free > m) + new_spl_free = m; + } + } + // NOW set spl_free from calculated new_spl_free spl_free = new_spl_free; // the direct equivalent of : @@ -4854,18 +5139,6 @@ spl_free_thread() if (lowmem) recent_lowmem = time_now; - // maintain an exponential moving average for the ema kstat - if (last_update > hz) - alpha = 1.0; - else { - double td_tick = (double)(time_now - last_update); - alpha = td_tick / (double)(hz*50.0); // roughly 0.02 - } - - ema_new = (alpha * delta) + (1.0 - alpha)*ema_old; - spl_free_delta_ema = ema_new; - ema_old = ema_new; - justwait: mutex_enter(&spl_free_thread_lock); CALLB_CPR_SAFE_BEGIN(&cpr); @@ -4883,12 +5156,20 @@ spl_free_thread() thread_exit(); } +/* + * Windows specific pressure monitor + * We expect this function to set + * spl_vm_pages_reclaimed + * spl_vm_pages_wanted + * spl_vm_pressure_level + * (kVMPressureNormal=0, Warning=1, Urgent=2, Critical=3) + */ static void spl_event_thread(void *notused) { // callb_cpr_t cpr; NTSTATUS Status; - + LARGE_INTEGER timeout; DECLARE_CONST_UNICODE_STRING(low_mem_name, L"\\KernelObjects\\LowMemoryCondition"); HANDLE low_mem_handle; @@ -4905,24 +5186,44 @@ spl_event_thread(void *notused) dprintf("SPL: beginning spl_event_thread() loop\n"); + timeout.QuadPart = -SEC2NSEC100(30); // 30 seconds. + while (!spl_event_thread_exit) { /* Don't busy loop */ delay(hz); - /* Sleep forever waiting for event */ + /* + * Sleep up to 30s waiting for event, if timeout + * we assume the system is not "low memory". 
+ */ Status = KeWaitForSingleObject(low_mem_event, Executive, - KernelMode, FALSE, NULL); + KernelMode, FALSE, &timeout); KeClearEvent(low_mem_event); - dprintf("%s: LOWMEMORY EVENT *** 0x%x (memusage: %llu)\n", - __func__, Status, segkmem_total_mem_allocated); - /* We were signalled */ - // vm_page_free_wanted = vm_page_free_min; - spl_free_set_pressure(spl_vm_page_free_min); - cv_broadcast(&spl_free_thread_cv); - } + if (Status == STATUS_TIMEOUT) { + + spl_vm_pages_reclaimed = 0; + if (spl_vm_pressure_level > 0) + spl_vm_pressure_level--; + else + spl_vm_pages_wanted = 0; + + } else { + dprintf( + "%s: LOWMEMORY EVENT *** 0x%x (memusage: %llu)\n", + __func__, Status, segkmem_total_mem_allocated); + /* We were signalled */ + // vm_page_free_wanted = vm_page_free_min; + // spl_free_set_pressure(spl_vm_page_free_min); + spl_vm_pages_reclaimed = 0; + spl_vm_pages_wanted += spl_vm_page_free_min; + if (spl_vm_pressure_level < 3) + spl_vm_pressure_level++; + cv_broadcast(&spl_free_thread_cv); + } + } ZwClose(low_mem_handle); spl_event_thread_exit = FALSE; @@ -4977,6 +5278,43 @@ spl_kstat_update(kstat_t *ksp, int rw) ks->kmem_free_to_slab_when_fragmented.value.ui64; } + if ((unsigned int) ks->spl_split_stack_below.value.ui64 != + spl_split_stack_below) { + spl_split_stack_below = + (unsigned int) + ks->spl_split_stack_below.value.ui64; + } + + if (ks->spl_enforce_memory_caps.value.ui64 != + spl_enforce_memory_caps) { + spl_enforce_memory_caps = + ks->spl_enforce_memory_caps.value.ui64; + } + + if (ks->spl_manual_memory_cap.value.ui64 != + spl_manual_memory_cap) { + uint64_t v = + ks->spl_manual_memory_cap.value.ui64; + if (v < total_memory >> 3) + v = total_memory >> 3; + else if (v > total_memory) + v = 0; + spl_manual_memory_cap = v; + } + + if (ks->spl_dynamic_memory_cap.value.ui64 != + spl_dynamic_memory_cap) { + uint64_t v = + ks->spl_dynamic_memory_cap.value.ui64; + if (v == 0) + v = total_memory; + else if (v < total_memory >> 3) + v = total_memory >> 3; + else if (v > total_memory) + v = total_memory; + spl_dynamic_memory_cap = v; + } + } else { ks->spl_os_alloc.value.ui64 = segkmem_total_mem_allocated; ks->spl_active_threads.value.ui64 = zfs_threads; @@ -4988,12 +5326,40 @@ spl_kstat_update(kstat_t *ksp, int rw) spl_free_manual_pressure; ks->spl_spl_free_fast_pressure.value.i64 = spl_free_fast_pressure; - ks->spl_spl_free_delta_ema.value.i64 = spl_free_delta_ema; ks->spl_osif_malloc_success.value.ui64 = stat_osif_malloc_success; + ks->spl_osif_malloc_fail.value.ui64 = + stat_osif_malloc_fail; ks->spl_osif_malloc_bytes.value.ui64 = stat_osif_malloc_bytes; ks->spl_osif_free.value.ui64 = stat_osif_free; ks->spl_osif_free_bytes.value.ui64 = stat_osif_free_bytes; + + ks->spl_enforce_memory_caps.value.ui64 = + spl_enforce_memory_caps; + ks->spl_dynamic_memory_cap.value.ui64 = + spl_dynamic_memory_cap; + ks->spl_dynamic_memory_cap_skipped.value.ui64 = + spl_dynamic_memory_cap_skipped; + ks->spl_dynamic_memory_cap_reductions.value.ui64 = + spl_dynamic_memory_cap_reductions; + ks->spl_dynamic_memory_cap_hit_floor.value.ui64 = + spl_dynamic_memory_cap_hit_floor; + ks->spl_manual_memory_cap.value.ui64 = + spl_manual_memory_cap; + ks->spl_memory_cap_enforcements.value.ui64 = + spl_memory_cap_enforcements; + + ks->spl_osif_malloc_sub128k.value.ui64 = + stat_osif_malloc_sub128k; + ks->spl_osif_malloc_sub64k.value.ui64 = + stat_osif_malloc_sub64k; + ks->spl_osif_malloc_sub32k.value.ui64 = + stat_osif_malloc_sub32k; + ks->spl_osif_malloc_page.value.ui64 = + stat_osif_malloc_page; + 
ks->spl_osif_malloc_subpage.value.ui64 = + stat_osif_malloc_subpage; + ks->spl_bucket_non_pow2_allocs.value.ui64 = spl_bucket_non_pow2_allocs; @@ -5010,22 +5376,17 @@ spl_kstat_update(kstat_t *ksp, int rw) ks->spl_vmem_conditional_alloc_deny_bytes.value.ui64 = spl_vmem_conditional_alloc_deny_bytes; - ks->spl_xat_success.value.ui64 = spl_xat_success; - ks->spl_xat_late_success.value.ui64 = spl_xat_late_success; - ks->spl_xat_late_success_nosleep.value.ui64 = - spl_xat_late_success_nosleep; ks->spl_xat_pressured.value.ui64 = spl_xat_pressured; - ks->spl_xat_bailed.value.ui64 = spl_xat_bailed; - ks->spl_xat_bailed_contended.value.ui64 = - spl_xat_bailed_contended; ks->spl_xat_lastalloc.value.ui64 = spl_xat_lastalloc; ks->spl_xat_lastfree.value.ui64 = spl_xat_lastfree; - ks->spl_xat_forced.value.ui64 = spl_xat_forced; ks->spl_xat_sleep.value.ui64 = spl_xat_sleep; - ks->spl_xat_late_deny.value.ui64 = spl_xat_late_deny; - ks->spl_xat_no_waiters.value.ui64 = spl_xat_no_waiters; - ks->spl_xft_wait.value.ui64 = spl_xft_wait; + ks->spl_vba_fastpath.value.ui64 = + spl_vba_fastpath; + ks->spl_vba_fastexit.value.ui64 = + spl_vba_fastexit; + ks->spl_vba_slowpath.value.ui64 = + spl_vba_slowpath; ks->spl_vba_parent_memory_appeared.value.ui64 = spl_vba_parent_memory_appeared; ks->spl_vba_parent_memory_blocked.value.ui64 = @@ -5072,6 +5433,8 @@ spl_kstat_update(kstat_t *ksp, int rw) spl_lowest_vdev_disk_stack_remaining; ks->spl_lowest_zvol_stack_remaining.value.ui64 = spl_lowest_zvol_stack_remaining; + ks->spl_split_stack_below.value.ui64 = + spl_split_stack_below; } return (0); @@ -5355,6 +5718,8 @@ spl_kmem_thread_init(void) // Initialize the spl_free locks mutex_init(&spl_free_thread_lock, "spl_free_thead_lock", MUTEX_DEFAULT, NULL); + mutex_init(&spl_dynamic_memory_cap_lock, "spl_dynamic_memory_cap_lock", + MUTEX_DEFAULT, NULL); kmem_taskq = taskq_create("kmem_taskq", 1, minclsyspri, 600, INT_MAX, TASKQ_PREPOPULATE); @@ -5392,6 +5757,8 @@ spl_kmem_thread_fini(void) cv_destroy(&spl_free_thread_cv); mutex_destroy(&spl_free_thread_lock); + mutex_destroy(&spl_dynamic_memory_cap_lock); + bsd_untimeout(kmem_update, &kmem_update_timer); bsd_untimeout(kmem_reap_timeout, &kmem_reaping); bsd_untimeout(kmem_reap_timeout, &kmem_reaping_idspace); @@ -6606,18 +6973,25 @@ kmem_cache_buf_in_cache(kmem_cache_t *cparg, void *bufarg) } if (sp == NULL) { + dprintf("SPL: %s: KMERR_BADADDR orig cache = %s\n", + __func__, cparg->cache_name); TraceEvent(TRACE_ERROR, "SPL: %s: KMERR_BADADDR orig cache =" " %s\n", __func__, cparg->cache_name); return (NULL); } if (cp == NULL) { + dprintf("SPL: %s: ERROR cp == NULL; cparg == %s", + __func__, cparg->cache_name); TraceEvent(TRACE_ERROR, "SPL: %s: ERROR cp == NULL; cparg ==" " %s", __func__, cparg->cache_name); return (NULL); } if (cp != cparg) { + dprintf("SPL: %s: KMERR_BADCACHE arg cache = %s but found " + "in %s instead\n", + __func__, cparg->cache_name, cp->cache_name); TraceEvent(TRACE_ERROR, "SPL: %s: KMERR_BADCACHE arg cache =" " %s but found in %s instead\n", __func__, cparg->cache_name, cp->cache_name); diff --git a/module/os/windows/spl/spl-kstat.c b/module/os/windows/spl/spl-kstat.c index d19115bf91aa..6970cf4a219e 100644 --- a/module/os/windows/spl/spl-kstat.c +++ b/module/os/windows/spl/spl-kstat.c @@ -700,7 +700,7 @@ kstat_free(ekstat_t *e) extern vmem_t *heap_arena; void *segkmem_alloc(vmem_t *vmp, size_t size, int vmflag); -void segkmem_free(vmem_t *vmp, void *inaddr, size_t size); +void segkmem_free(vmem_t *vmp, const void *inaddr, size_t size); /* * Create various 
system kstats. diff --git a/module/os/windows/spl/spl-seg_kmem.c b/module/os/windows/spl/spl-seg_kmem.c index a992610785db..44ad83133eb7 100644 --- a/module/os/windows/spl/spl-seg_kmem.c +++ b/module/os/windows/spl/spl-seg_kmem.c @@ -88,16 +88,13 @@ #ifdef _KERNEL -#define XNU_KERNEL_PRIVATE - #include - #endif /* _KERNEL */ typedef int page_t; void *segkmem_alloc(vmem_t *vmp, size_t size, int vmflag); -void segkmem_free(vmem_t *vmp, void *inaddr, size_t size); +void segkmem_free(vmem_t *vmp, const void *inaddr, size_t size); /* Total memory held allocated */ uint64_t segkmem_total_mem_allocated = 0; @@ -107,13 +104,21 @@ vmem_t *heap_arena; /* qcaches abd */ vmem_t *abd_arena; +vmem_t *abd_subpage_arena; #ifdef _KERNEL extern uint64_t total_memory; uint64_t stat_osif_malloc_success = 0; +uint64_t stat_osif_malloc_fail = 0; uint64_t stat_osif_free = 0; uint64_t stat_osif_malloc_bytes = 0; uint64_t stat_osif_free_bytes = 0; +uint64_t stat_osif_malloc_sub128k = 0; +uint64_t stat_osif_malloc_sub64k = 0; +uint64_t stat_osif_malloc_sub32k = 0; +uint64_t stat_osif_malloc_page = 0; +uint64_t stat_osif_malloc_subpage = 0; +void spl_free_set_emergency_pressure(int64_t new_p); #endif void * @@ -122,6 +127,17 @@ osif_malloc(uint64_t size) #ifdef _KERNEL void *tr = NULL; + if (size < PAGESIZE) + atomic_inc_64(&stat_osif_malloc_subpage); + else if (size == PAGESIZE) + atomic_inc_64(&stat_osif_malloc_page); + else if (size < 32768) + atomic_inc_64(&stat_osif_malloc_sub32k); + else if (size < 65536) + atomic_inc_64(&stat_osif_malloc_sub64k); + else if (size < 131072) + atomic_inc_64(&stat_osif_malloc_sub128k); + tr = ExAllocatePoolWithTag(NonPagedPoolNx, size, '!SFZ'); ASSERT(P2PHASE(tr, PAGE_SIZE) == 0); if (tr != NULL) { @@ -132,10 +148,13 @@ osif_malloc(uint64_t size) } else { dprintf("%s:%d: ExAllocatePoolWithTag failed (memusage: %llu)" "\n", __func__, __LINE__, segkmem_total_mem_allocated); + extern volatile unsigned int vm_page_free_wanted; extern volatile unsigned int vm_page_free_min; - spl_free_set_pressure(vm_page_free_min); + spl_free_set_emergency_pressure(vm_page_free_min); vm_page_free_wanted = vm_page_free_min; + + atomic_inc_64(&stat_osif_malloc_fail); return (NULL); } #else @@ -144,7 +163,7 @@ osif_malloc(uint64_t size) } void -osif_free(void *buf, uint64_t size) +osif_free(const void *buf, uint64_t size) { #ifdef _KERNEL ExFreePoolWithTag(buf, '!SFZ'); @@ -163,7 +182,13 @@ osif_free(void *buf, uint64_t size) void kernelheap_init() { - heap_arena = vmem_init("heap", NULL, 0, PAGESIZE, segkmem_alloc, + heap_arena = vmem_init("heap", NULL, 0, +#if defined(__arm64__) + 4096, +#else + PAGESIZE, +#endif + segkmem_alloc, segkmem_free); } @@ -181,7 +206,7 @@ segkmem_alloc(vmem_t *vmp, size_t size, int maybe_unmasked_vmflag) } void -segkmem_free(vmem_t *vmp, void *inaddr, size_t size) +segkmem_free(vmem_t *vmp, const void *inaddr, size_t size) { osif_free(inaddr, size); // since this is mainly called by spl_root_arena and free_arena, @@ -230,18 +255,22 @@ segkmem_abd_init() * PAGESIZE is an even multiple of at least several SPA_MINBLOCKSIZE. * This will be _Static_assert-ed in abd_os.c. 
*/ -#if 0 // macos + abd_subpage_arena = vmem_create("abd_subpage_cache", NULL, 0, 512, vmem_alloc_impl, vmem_free_impl, abd_arena, 131072, VM_SLEEP | VMC_NO_QCACHE | VM_FIRSTFIT); VERIFY3P(abd_subpage_arena, !=, NULL); -#endif + } void segkmem_abd_fini(void) { + if (abd_subpage_arena) { + vmem_destroy(abd_subpage_arena); + } + if (abd_arena) { vmem_destroy(abd_arena); } diff --git a/module/os/windows/spl/spl-vmem.c b/module/os/windows/spl/spl-vmem.c index 03e3e702251a..eb7262ae2574 100644 --- a/module/os/windows/spl/spl-vmem.c +++ b/module/os/windows/spl/spl-vmem.c @@ -26,7 +26,7 @@ /* * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. - * Copyright (c) 2017 Sean Doran + * Copyright (c) 2017, 2021, 2023 by Sean Doran */ /* @@ -367,7 +367,9 @@ static vmem_kstat_t vmem_kstat_template = { { "parent_alloc", KSTAT_DATA_UINT64 }, { "parent_free", KSTAT_DATA_UINT64 }, { "threads_waiting", KSTAT_DATA_UINT64 }, - { "excess", KSTAT_DATA_UINT64 }, + { "excess", KSTAT_DATA_UINT64 }, + { "lowest_stack", KSTAT_DATA_UINT64 }, + { "async_stack_calls", KSTAT_DATA_UINT64 }, }; @@ -406,20 +408,14 @@ uint64_t spl_vmem_conditional_alloc_deny = 0; uint64_t spl_vmem_conditional_alloc_deny_bytes = 0; // bucket allocator kstat -uint64_t spl_xat_success = 0; -uint64_t spl_xat_late_success = 0; -uint64_t spl_xat_late_success_nosleep = 0; uint64_t spl_xat_pressured = 0; -uint64_t spl_xat_bailed = 0; -uint64_t spl_xat_bailed_contended = 0; uint64_t spl_xat_lastalloc = 0; uint64_t spl_xat_lastfree = 0; -uint64_t spl_xat_forced = 0; uint64_t spl_xat_sleep = 0; -uint64_t spl_xat_late_deny = 0; -uint64_t spl_xat_no_waiters = 0; -uint64_t spl_xft_wait = 0; +uint64_t spl_vba_fastpath = 0; +uint64_t spl_vba_fastexit = 0; +uint64_t spl_vba_slowpath = 0; uint64_t spl_vba_parent_memory_appeared = 0; uint64_t spl_vba_parent_memory_blocked = 0; uint64_t spl_vba_hiprio_blocked = 0; @@ -430,6 +426,8 @@ uint64_t spl_vba_loop_timeout_blocked = 0; uint64_t spl_vba_sleep = 0; uint64_t spl_vba_loop_entries = 0; +extern uint64_t stat_osif_malloc_fail; + // bucket minimum span size tunables uint64_t spl_bucket_tunable_large_span = 0; uint64_t spl_bucket_tunable_small_span = 0; @@ -451,7 +449,15 @@ extern void spl_free_set_emergency_pressure(int64_t p); extern uint64_t segkmem_total_mem_allocated; extern uint64_t total_memory; -_Atomic uint64_t spl_lowest_alloc_stack_remaining = 0; +extern uint64_t spl_enforce_memory_caps; +extern _Atomic uint64_t spl_dynamic_memory_cap; +extern hrtime_t spl_dynamic_memory_cap_last_downward_adjust; +extern kmutex_t spl_dynamic_memory_cap_lock; +extern uint64_t spl_dynamic_memory_cap_reductions; +extern uint64_t spl_dynamic_memory_cap_hit_floor; + +#define INITIAL_BLOCK_SIZE 16ULL*1024ULL*1024ULL +static char *initial_default_block = NULL; /* * Get a vmem_seg_t from the global segfree list. 
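The segkmem_abd_init() change in spl-seg_kmem.c above relies on PAGESIZE being an even multiple of SPA_MINBLOCKSIZE (512 bytes in OpenZFS), and its comment notes that abd_os.c enforces this at compile time. The abd_os.c hunk is not part of this excerpt, so the following is only a sketch of what such a compile-time check could look like:

    /* illustrative only; the real assertion lives in abd_os.c */
    _Static_assert(PAGESIZE % SPA_MINBLOCKSIZE == 0,
        "PAGESIZE must be an even multiple of SPA_MINBLOCKSIZE");
    _Static_assert(PAGESIZE / SPA_MINBLOCKSIZE >= 4,
        "PAGESIZE must cover several SPA_MINBLOCKSIZE blocks");

Keeping the abd_subpage_arena quantum at 512 means every sub-page ABD handed out by that arena is a whole number of minimum-sized blocks, carved from the larger spans it imports from abd_arena.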
@@ -626,6 +632,9 @@ vmem_freelist_insert_sort_by_time(vmem_t *vmp, vmem_seg_t *vsp) ASSERT(vsp->vs_span_createtime != 0); if (vsp->vs_span_createtime == 0) { + dprintf("SPL: %s: WARNING: " + "vsp->vs_span_createtime == 0 (%s)!\n", + __func__, vmp->vm_name); TraceEvent(TRACE_WARNING, "SPL: %s: WARNING: " "vsp->vs_span_createtime == 0 (%s)!\n", __func__, vmp->vm_name); @@ -1306,18 +1315,51 @@ spl_vmem_xnu_useful_bytes_free(void) extern _Atomic uint32_t spl_vm_pages_wanted; extern _Atomic uint32_t spl_vm_pressure_level; - if (spl_vm_pages_wanted > 0) - return (PAGE_SIZE * spl_vm_pages_reclaimed); + /* carve out a small reserve for unconditional allocs */ + const uint64_t reserve = total_memory >> 9ULL; + const uint64_t total_minus_reserve = total_memory - reserve; + + /* + * pages are wanted *and* we are in our reserve area, + * so we report only one page of "usable" memory. + * + * if we are below the reserve, return the amount left + */ + + if (spl_vm_pages_wanted > 0) { + if (segkmem_total_mem_allocated >= total_minus_reserve) + return (PAGE_SIZE * MAX(spl_vm_pages_reclaimed, 1)); + else + return (total_minus_reserve - + (segkmem_total_mem_allocated + + PAGE_SIZE * spl_vm_pages_reclaimed)); + } /* + * If there is pressure, and we are in the reserve area, + * then there is no "usable" memory, unless we have reclaimed + * some pages. + * * beware of large magic guard values, - * the pressure enum only goes to 4 + * the pressure enum only goes to 4. */ + if (spl_vm_pressure_level > 0 && - spl_vm_pressure_level < 100) - return (0); + spl_vm_pressure_level < 100) { + if (spl_vm_pages_reclaimed > 0) + return (PAGE_SIZE * spl_vm_pages_reclaimed); + else if (segkmem_total_mem_allocated < total_minus_reserve) + return (PAGE_SIZE); + else + return (0); + } - return (total_memory - segkmem_total_mem_allocated); + /* + * No pressure: return non-reserved bytes not allocated. + * The reserve may be needed for VM_NOWAIT and VM_PANIC flags. + */ + + return (total_minus_reserve - segkmem_total_mem_allocated); } uint64_t @@ -1336,53 +1378,21 @@ spl_vmem_malloc_unconditionally_unlocked(size_t size) return (osif_malloc(size)); } -static void * -spl_vmem_malloc_unconditionally(size_t size) -{ - mutex_enter(&vmem_xnu_alloc_lock); - void *m = spl_vmem_malloc_unconditionally_unlocked(size); - mutex_exit(&vmem_xnu_alloc_lock); - return (m); -} - -static void * -spl_vmem_malloc_if_no_pressure(size_t size) -{ - // The mutex serializes concurrent callers, providing time for - // the variables in spl_vmem_xnu_useful_bytes_free() to be updated. - mutex_enter(&vmem_xnu_alloc_lock); - if (spl_vmem_xnu_useful_bytes_free() > (MAX(size, 1024ULL*1024ULL))) { - extern void *osif_malloc(uint64_t); - void *p = osif_malloc(size); - if (p != NULL) { - spl_vmem_conditional_allocs++; - spl_vmem_conditional_alloc_bytes += size; - } - mutex_exit(&vmem_xnu_alloc_lock); - return (p); - } else { - spl_vmem_conditional_alloc_deny++; - spl_vmem_conditional_alloc_deny_bytes += size; - mutex_exit(&vmem_xnu_alloc_lock); - return (NULL); - } -} - /* * Allocate size bytes at offset phase from an align boundary such that the * resulting segment [addr, addr + size) is a subset of [minaddr, maxaddr) * that does not straddle a nocross-aligned boundary. 
*/ -void * +inline void * vmem_xalloc(vmem_t *vmp, size_t size, size_t align_arg, size_t phase, size_t nocross, void *minaddr, void *maxaddr, int vmflag) { vmem_seg_t *vsp; vmem_seg_t *vbest = NULL; - uintptr_t addr, taddr, start, end; + uintptr_t addr = 0, taddr, start, end; uintptr_t align = (align_arg != 0) ? align_arg : vmp->vm_quantum; void *vaddr, *xvaddr = NULL; - size_t xsize; + size_t xsize = 0; int hb, flist, resv; uint32_t mtbf; @@ -1541,7 +1551,7 @@ vmem_xalloc(vmem_t *vmp, size_t size, size_t align_arg, size_t phase, vmp->vm_nsegfree -= resv; /* reserve our segs */ mutex_exit(&vmp->vm_lock); if (vmp->vm_cflags & VMC_XALLOC) { - size_t oasize = asize; + ASSERTV(size_t oasize = asize); vaddr = ((vmem_ximport_t *) vmp->vm_source_alloc)(vmp->vm_source, &asize, align, vmflag & VM_KMFLAGS); @@ -1689,7 +1699,7 @@ vmem_xalloc(vmem_t *vmp, size_t size, size_t align_arg, size_t phase, * both routines bypass the quantum caches. */ void -vmem_xfree(vmem_t *vmp, void *vaddr, size_t size) +vmem_xfree(vmem_t *vmp, const void *vaddr, size_t size) { vmem_seg_t *vsp, *vnext, *vprev; @@ -1733,7 +1743,8 @@ vmem_xfree(vmem_t *vmp, void *vaddr, size_t size) vmem_span_destroy(vmp, vsp); vmp->vm_kstat.vk_parent_free.value.ui64++; mutex_exit(&vmp->vm_lock); - vmp->vm_source_free(vmp->vm_source, vaddr, size); + vmp->vm_source_free(vmp->vm_source, + __DECONST(void *, vaddr), size); } else { vmem_freelist_insert(vmp, vsp); mutex_exit(&vmp->vm_lock); @@ -1749,6 +1760,18 @@ vmem_xfree(vmem_t *vmp, void *vaddr, size_t size) * instead of the default instant-fit policy. VM_SLEEP allocations are * guaranteed to succeed. */ +/* + * If there is less space on the kernel stack than + * (dynamically tunable) spl_split_stack_below + * then perform the vmem_alloc in the thread_call + * function. Don't set it to 16384, because then it + * continuously triggers, and we hang. + */ +unsigned long spl_split_stack_below = 8192; + +/* kstat tracking the global minimum free stack space */ +_Atomic unsigned int spl_lowest_alloc_stack_remaining = UINT_MAX; + void * vmem_alloc_impl(vmem_t *vmp, size_t size, int vmflag) { @@ -1816,7 +1839,7 @@ vmem_alloc_impl(vmem_t *vmp, size_t size, int vmflag) * Free the segment [vaddr, vaddr + size). */ void -vmem_free_impl(vmem_t *vmp, void *vaddr, size_t size) +vmem_free_impl(vmem_t *vmp, const void *vaddr, size_t size) { if (size - 1 < vmp->vm_qcache_max) kmem_cache_free(vmp->vm_qcache[(size - 1) >> vmp->vm_qshift], @@ -1990,7 +2013,7 @@ spl_vmem_size(vmem_t *vmp, int typemask) static vmem_t * vmem_create_common(const char *name, void *base, size_t size, size_t quantum, void *(*afunc)(vmem_t *, size_t, int), - void (*ffunc)(vmem_t *, void *, size_t), + void (*ffunc)(vmem_t *, const void *, size_t), vmem_t *source, size_t qcache_max, int vmflag) { int i; @@ -2063,7 +2086,7 @@ vmem_create_common(const char *name, void *base, size_t size, size_t quantum, vmp->vm_kstat.vk_source_id.value.ui32 = source->vm_id; vmp->vm_source = source; vmp->vm_source_alloc = afunc; - vmp->vm_source_free = ffunc; + vmp->vm_source_free = __DECONST(void *, ffunc); /* * Some arenas (like vmem_metadata and kmem_metadata) cannot @@ -2269,39 +2292,78 @@ int vmem_rescale_minshift = 3; /* * Resize vmp's hash table to keep the average lookup depth near 1.0. + * + * The decision to exit early, before allocating a new table, is done outside + * a mutex lock. The calculation of memory that should be allocated and the + * allocation itself is also done outside the lock. 
The allocation CANNOT be
+ * safely done under this mutex, and there is no reason to lock the subsequent
+ * memset.
+ *
+ * However, another thread (including ones possibly awakened by the
+ * cv_broadcast() in our caller vmem_update()) can change the number of bytes
+ * allocated or freed in our vmem arena; enough of a downward change (e.g. from
+ * reaping after a reduction of ARC frees many scatter ABDs) will cause our
+ * previous outside-the-lock new_table allocation to be the wrong size,
+ * potentially leading to a loss of information about vmem_alloc_impl()
+ * allocations made before we acquire vmp->vm_lock. In turn, this leads
+ * to a panic when doing a vmem_free_impl() on an improperly-recorded segment.
+ *
+ * Consequently, once we hold vmp->vm_lock we must recalculate new_size and
+ * compare that with the previously-calculated nolock_new_size. If they do
+ * not match we must clean up and return rather than attempt to use new_table.
  */
 static void
 vmem_hash_rescale(vmem_t *vmp)
 {
-	vmem_seg_t **old_table, **new_table, *vsp;
-	size_t old_size, new_size, h, nseg;
+	vmem_seg_t **new_table, *vsp;
 
-	nseg = (size_t)(vmp->vm_kstat.vk_alloc.value.ui64 -
+	const size_t nolock_nseg =
+	    (size_t)(vmp->vm_kstat.vk_alloc.value.ui64 -
 	    vmp->vm_kstat.vk_free.value.ui64);
 
-	new_size = MAX(VMEM_HASH_INITIAL, 1 << (highbit(3 * nseg + 4) - 2));
-	old_size = vmp->vm_hash_mask + 1;
+	const size_t nolock_new_size = MAX(VMEM_HASH_INITIAL,
+	    1 << (highbit(3 * nolock_nseg + 4) - 2));
+	const size_t nolock_old_size = vmp->vm_hash_mask + 1;
 
-	if ((old_size >> vmem_rescale_minshift) <= new_size &&
-	    new_size <= (old_size << 1))
+	if ((nolock_old_size >> vmem_rescale_minshift) <= nolock_new_size &&
+	    nolock_new_size <= (nolock_old_size << 1))
 		return;
 
-	new_table = vmem_alloc_impl(vmem_hash_arena, new_size * sizeof (void *),
+	new_table = vmem_alloc_impl(vmem_hash_arena,
+	    nolock_new_size * sizeof (void *),
 	    VM_NOSLEEP);
 	if (new_table == NULL)
 		return;
-	memset(new_table, 0, new_size * sizeof (void *));
+	memset(new_table, 0, nolock_new_size * sizeof (void *));
 
 	mutex_enter(&vmp->vm_lock);
 
-	old_size = vmp->vm_hash_mask + 1;
-	old_table = vmp->vm_hash_table;
+	const size_t nseg = (size_t)(vmp->vm_kstat.vk_alloc.value.ui64 -
+	    vmp->vm_kstat.vk_free.value.ui64);
+
+	const size_t new_size = MAX(VMEM_HASH_INITIAL,
+	    1 << (highbit(3 * nseg + 4) - 2));
+
+	if (new_size != nolock_new_size) {
+		dprintf("ZFS: SPL: %s:%d:%s:"
+		    " race condition found: %s, %llu, %llu\n",
+		    __FILE__, __LINE__, __func__,
+		    vmp->vm_name,
+		    (uint64_t)nolock_new_size, (uint64_t)new_size);
+		mutex_exit(&vmp->vm_lock);
+		vmem_free_impl(vmem_hash_arena, new_table,
+		    nolock_new_size * sizeof (void *));
+		return;
+	}
+
+	const size_t old_size = vmp->vm_hash_mask + 1;
+	vmem_seg_t **old_table = vmp->vm_hash_table;
 
 	vmp->vm_hash_mask = new_size - 1;
 	vmp->vm_hash_table = new_table;
 	vmp->vm_hash_shift = highbit(vmp->vm_hash_mask);
 
-	for (h = 0; h < old_size; h++) {
+	for (size_t h = 0; h < old_size; h++) {
 		vsp = old_table[h];
 		while (vsp != NULL) {
 			uintptr_t addr = vsp->vs_start;
@@ -2335,8 +2397,15 @@ vmem_update(void *dummy)
 	 * If threads are waiting for resources, wake them up
 	 * periodically so they can issue another kmem_reap()
 	 * to reclaim resources cached by the slab allocator.
+	 *
+	 * In general it is good practice to take the associated
+	 * lock before calling cv_broadcast(). Here it gives any
+	 * waiters a good shot at the lock that may be (re-)taken
+	 * by this thread in the vmem_hash_rescale() function.
*/ + mutex_enter(&vmp->vm_lock); cv_broadcast(&vmp->vm_cv); + mutex_exit(&vmp->vm_lock); /* * Rescale the hash table to keep the hash chains short. @@ -2383,7 +2452,7 @@ vmem_bucket_number(size_t size) if (bucket < 0) bucket = 0; - return (bucket); + return ((int16_t)bucket); } static inline vmem_t * @@ -2400,353 +2469,125 @@ spl_vmem_bucket_arena_by_size(size_t size) return (vmem_bucket_arena_by_size(size)); } +/* + * We have just freed memory back to Windows so we let any waiters on the + * lowest-level bucket arenas know they have a chance to make progress in + * their hunt for memory from the operating system. We then tell the heap that + * there may be memory freshly imported into the buckets. + * + * This function broadcasts to waiters on the smallest-span buckets first, and + * because of mutex-ordering this biases towards small-allocation kmem caches. + */ static inline void vmem_bucket_wake_all_waiters(void) { for (int i = VMEM_BUCKET_LOWBIT; i < VMEM_BUCKET_HIBIT; i++) { const int bucket = i - VMEM_BUCKET_LOWBIT; vmem_t *bvmp = vmem_bucket_arena[bucket]; + mutex_enter(&bvmp->vm_lock); cv_broadcast(&bvmp->vm_cv); + mutex_exit(&bvmp->vm_lock); } + mutex_enter(&spl_heap_arena->vm_lock); cv_broadcast(&spl_heap_arena->vm_cv); -} - -/* - * xnu_alloc_throttled_bail() : spin looking for memory - * - */ - -static inline void * -xnu_alloc_throttled_bail(uint64_t now_ticks, vmem_t *calling_vmp, - size_t size, int vmflags) -{ - // spin looking for memory - const uint64_t bigtarget = MAX(size, 16ULL*1024ULL*1024ULL); - static volatile _Atomic bool alloc_lock = false; - static volatile _Atomic uint64_t force_time = 0; - - uint64_t timeout_ticks = hz / 2; - if (vmflags & VM_PUSHPAGE) - timeout_ticks = hz / 4; - - uint64_t timeout_time = now_ticks + timeout_ticks; - - for (uint32_t suspends = 0, blocked_suspends = 0, - try_no_pressure = 0; /* empty */; /* empty */) { - if (force_time + timeout_ticks > timeout_time) { - // another thread has forced an allocation - // by timing out. push our deadline into the future. - timeout_time = force_time + timeout_ticks; - } - if (alloc_lock) { - blocked_suspends++; - IOSleep(1); - } else if (spl_vmem_xnu_useful_bytes_free() >= bigtarget) { - bool f = false; - // if alloc_lock == f then alloc_lock = true and result - // is true otherwise result is false and f = true - if (!__c11_atomic_compare_exchange_strong(&alloc_lock, - &f, true, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) { - /* - * avoid (highly unlikely) data race on - * alloc_lock. if alloc_lock has become true - * while we were in the else if expression - * then we effectively optimize away the - * (relaxed) load of alloc_lock (== true) - * into f and continue. 
- */ - continue; - } - // alloc_lock is now visible as true to all threads - try_no_pressure++; - void *m = spl_vmem_malloc_if_no_pressure(size); - if (m != NULL) { - uint64_t ticks = zfs_lbolt() - now_ticks; - dprintf("SPL: %s returning %llu bytes after " - "%llu ticks (hz=%u, seconds = %llu), " - "%u suspends, %u blocked, %u tries (%s)\n", - __func__, (uint64_t)size, - ticks, hz, ticks/hz, suspends, - blocked_suspends, try_no_pressure, - calling_vmp->vm_name); - // atomic seq cst, so is published to all - // threads - alloc_lock = false; - return (m); - } else { - alloc_lock = false; - spl_free_set_emergency_pressure(bigtarget); - suspends++; - IOSleep(1); - } - } else if (zfs_lbolt() > timeout_time) { - bool f = false; - if (!__c11_atomic_compare_exchange_strong(&alloc_lock, - &f, true, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) { - // avoid (highly unlikely) data race on - // alloc_lock as above - continue; - } - void *mp = spl_vmem_malloc_unconditionally(size); - uint64_t now = zfs_lbolt(); - uint64_t ticks = now - now_ticks; - force_time = now; - dprintf("SPL: %s TIMEOUT %llu bytes after " - "%llu ticks (hz=%u, seconds=%llu), " - "%u suspends, %u blocked, %u tries (%s)\n", - __func__, (uint64_t)size, - ticks, hz, ticks/hz, suspends, - blocked_suspends, try_no_pressure, - calling_vmp->vm_name); - alloc_lock = false; - atomic_inc_64(&spl_xat_forced); - return (mp); - } else { - spl_free_set_emergency_pressure(bigtarget); - suspends++; - IOSleep(1); - } - } + mutex_exit(&spl_heap_arena->vm_lock); } static void * xnu_alloc_throttled(vmem_t *bvmp, size_t size, int vmflag) { - // the caller is one of the bucket arenas. - // null_vmp will be spl_default_arena_parent, which is - // just a placeholder. - - uint64_t now = zfs_lbolt(); - const uint64_t entry_now = now; + static volatile _Atomic uint64_t fail_at = 0; + static volatile _Atomic int16_t success_ct = 0; - void *m = spl_vmem_malloc_if_no_pressure(size); + void *p = spl_vmem_malloc_unconditionally_unlocked(size); - if (m != NULL) { - atomic_inc_64(&spl_xat_success); - spl_xat_lastalloc = gethrtime(); - // wake up waiters on all the arena condvars - // since there is apparently no memory shortage. - vmem_bucket_wake_all_waiters(); - return (m); - } else { - spl_free_set_emergency_pressure((int64_t)size); - } - - if (vmflag & VM_PANIC) { - // force an allocation now to avoid a panic + if (p != NULL) { + /* grow fail_at periodically */ + if (success_ct++ >= 128) { + fail_at += size; + success_ct = 0; + } spl_xat_lastalloc = gethrtime(); - spl_free_set_emergency_pressure(4LL * (int64_t)size); - void *p = spl_vmem_malloc_unconditionally(size); - // p cannot be NULL (unconditional kernel malloc always works - // or panics) - // therefore: success, wake all waiters on alloc|free condvar - // wake up arena waiters to let them know there is memory - // available in the arena; let waiters on other bucket arenas - // continue sleeping. 
cv_broadcast(&bvmp->vm_cv); return (p); } - if (vmflag & VM_NOSLEEP) { - spl_free_set_emergency_pressure(MAX(2LL * (int64_t)size, - 16LL*1024LL*1024LL)); - /* cheating a bit, but not really waiting */ - kpreempt(KPREEMPT_SYNC); - void *p = spl_vmem_malloc_if_no_pressure(size); - if (p != NULL) { - atomic_inc_64(&spl_xat_late_success_nosleep); - cv_broadcast(&bvmp->vm_cv); - spl_xat_lastalloc = gethrtime(); - } - // if p == NULL, then there will be an increment in - // the fail kstat - return (p); - } + success_ct = 0; + fail_at = segkmem_total_mem_allocated - size; /* - * Loop for a while trying to satisfy VM_SLEEP allocations. - * - * If we are able to allocate memory, then return the pointer. - * - * We return NULL if some other thread's activity has caused - * sufficient memory to appear in this arena that we can satisfy - * the allocation. - * - * We call xnu_alloc_throttle_bail() after a few milliseconds of - * waiting; it will either return a pointer to newly allocated - * memory or NULL. We return the result. + * adjust dynamic memory cap downwards by 1/32 (~ 3%) of total_memory + * but do not drop below 1/8 of total_memory.. * + * see also spl-kmem.c:spl_reduce_dynamic_cap(), which is + * triggered by ARC or other clients inquiring about spl_free() */ + if (spl_enforce_memory_caps != 0 && + (fail_at < spl_dynamic_memory_cap || + spl_dynamic_memory_cap == 0)) { + mutex_enter(&spl_dynamic_memory_cap_lock); + + spl_dynamic_memory_cap_last_downward_adjust = gethrtime(); + const int64_t thresh = total_memory >> 3; + const int64_t below_fail_at = fail_at - (total_memory >> 5); + const int64_t reduced = MAX(below_fail_at, thresh); + + if (spl_dynamic_memory_cap == 0 || + spl_dynamic_memory_cap >= total_memory) { + spl_dynamic_memory_cap = reduced; + atomic_inc_64(&spl_dynamic_memory_cap_reductions); + } else if (thresh > spl_dynamic_memory_cap) { + spl_dynamic_memory_cap = thresh; + atomic_inc_64(&spl_dynamic_memory_cap_hit_floor); + } else { + spl_dynamic_memory_cap = reduced; + atomic_inc_64(&spl_dynamic_memory_cap_reductions); + } - const uint32_t bucket_number = - vmem_bucket_id_to_bucket_number[bvmp->vm_id]; - static volatile _Atomic uint32_t waiters = 0; - - waiters++; - - if (waiters == 1UL) - atomic_inc_64(&spl_xat_no_waiters); - - static _Atomic uint32_t max_waiters_seen = 0; - - if (waiters > max_waiters_seen) { - max_waiters_seen = waiters; - dprintf("SPL: %s: max_waiters_seen increased to %u\n", __func__, - max_waiters_seen); + mutex_exit(&spl_dynamic_memory_cap_lock); } - boolean_t local_xat_pressured = false; + /* wait until used memory falls below failure_at */ - for (; /* empty */; /* empty */) { - clock_t wait_time = USEC2NSEC(500UL * MAX(waiters, 1UL)); - mutex_enter(&bvmp->vm_lock); - spl_xat_sleep++; - if (local_xat_pressured) { - spl_xat_pressured++; - local_xat_pressured = false; - } - (void) cv_timedwait_hires(&bvmp->vm_cv, &bvmp->vm_lock, - wait_time, 0, 0); - mutex_exit(&bvmp->vm_lock); - now = zfs_lbolt(); - // We may be here because of a broadcast to &vmp->vm_cv, - // causing xnu to schedule all the sleepers in priority-weighted - // FIFO order. Because of the mutex_exit(), the sections below - // here may be entered concurrently. - // spl_vmem_malloc_if_no_pressure does a mutex, so avoid calling - // it unless there is a chance it will succeed. 
- if (spl_vmem_xnu_useful_bytes_free() > (MAX(size, - 16ULL*1024ULL*1024ULL))) { - void *a = spl_vmem_malloc_if_no_pressure(size); - if (a != NULL) { - atomic_inc_64(&spl_xat_late_success); - spl_xat_lastalloc = gethrtime(); - waiters--; - // Wake up all waiters on the bucket arena - // locks, since the system apparently has - // memory again. - vmem_bucket_wake_all_waiters(); - return (a); - } else { - // Probably spl_vm_page_free_count changed while - // we were in the mutex queue in - // spl_vmem_malloc_if_no_pressure(). There is - // therefore no point in doing the bail-out - // check below, so go back to the top of the - // for loop. - atomic_inc_64(&spl_xat_late_deny); - continue; - } - } - if (now > entry_now + hz / 4 || - spl_vba_threads[bucket_number] > 1UL) { - // If there are other threads waiting for us - // in vba() then when we satisfy this allocation, - // we satisfy more than one thread, so invoke XATB(). - // Otherwise, if we have had no luck for 250 ms, then - // switch to XATB() which is much more aggressive. - if (spl_vba_threads[bucket_number] > 1UL) - atomic_inc_64(&spl_xat_bailed_contended); - atomic_inc_64(&spl_xat_bailed); - static _Atomic uint32_t bailing_threads = 0; - static _Atomic uint32_t max_bailers_seen = 0; - bailing_threads++; - if (bailing_threads > max_bailers_seen) { - max_bailers_seen = bailing_threads; - dprintf("SPL: %s: max_bailers_seen increased " - "to %u\n", __func__, max_bailers_seen); - } - void *b = - xnu_alloc_throttled_bail(now, bvmp, size, vmflag); - bailing_threads--; - spl_xat_lastalloc = gethrtime(); - // wake up waiters on the arena lock, - // since they now have memory they can use. - cv_broadcast(&bvmp->vm_cv); - // open turnstile after having bailed, rather - // than before - waiters--; - return (b); - } else if (now - entry_now > 0 && - ((now - entry_now) % (hz/10))) { - spl_free_set_emergency_pressure(MAX(size, - 16LL*1024LL*1024LL)); - local_xat_pressured = true; + extern void spl_set_arc_no_grow(int); + spl_set_arc_no_grow(B_TRUE); + spl_free_set_emergency_pressure(total_memory >> 7LL); + atomic_inc_64(&spl_xat_pressured); + if ((vmflag & (VM_NOSLEEP | VM_PANIC | VM_ABORT)) > 0) + return (NULL); + + for (uint64_t loop_for_mem = 1; ; loop_for_mem++) { + // ASSERT3U((loop_for_mem % 10), ==, 0); // 1 second bleat beat + IOSleep(100); /* hope someone frees memory */ + /* only try to allocate if there is memory */ + if (fail_at > segkmem_total_mem_allocated) { + p = spl_vmem_malloc_unconditionally_unlocked(size); + if (p != NULL) + return (p); + } else { + /* abuse existing kstat */ + atomic_inc_64(&spl_xat_sleep); } + success_ct = 0; + const uint64_t x = segkmem_total_mem_allocated - size; + if (fail_at > x) + fail_at = x; + spl_set_arc_no_grow(B_TRUE); + spl_free_set_emergency_pressure(total_memory >> 7LL); + atomic_inc_64(&spl_xat_pressured); + /* after ten seconds, just return NULL */ + if (loop_for_mem > 100) + return (NULL); } } static void -xnu_free_throttled(vmem_t *vmp, void *vaddr, size_t size) +xnu_free_throttled(vmem_t *vmp, const void *vaddr, size_t size) { - extern void osif_free(void *, uint64_t); + extern void osif_free(const void *, uint64_t); - // Serialize behind a (short) spin-sleep delay, giving - // xnu time to do freelist management and - // PT teardowns - - // In the usual case there is only one thread in this function, - // so we proceed waitlessly to osif_free(). - - // When there are multiple threads here, we delay the 2nd and later. 
- - // Explict race: - // The osif_free() is not protected by the vmem_xnu_alloc_lock - // mutex; that is just used for implementing the delay. Consequently, - // the waiters on the same lock in spl_vmem_malloc_if_no_pressure may - // falsely see too small a value for spl_vm_page_free_count. We don't - // care in part because xnu performs poorly when doing - // free-then-allocate anwyay. - - // a_waiters gauges the loop exit checking and sleep duration; - // it is a count of the number of threads trying to do work - // in this function. - static volatile _Atomic uint32_t a_waiters = 0; - - // is_freeing protects the osif_free() call; see comment below - static volatile _Atomic bool is_freeing = false; - - a_waiters++; // generates "lock incl ..." - - static _Atomic uint32_t max_waiters_seen = 0; - - if (a_waiters > max_waiters_seen) { - max_waiters_seen = a_waiters; - dprintf("SPL: %s: max_waiters_seen increased to %u\n", - __func__, max_waiters_seen); - } - - for (uint32_t iter = 0; a_waiters > 1UL; iter++) { - // there is more than one thread here, so suspend and - // sleep for 1 ms - atomic_inc_64(&spl_xft_wait); - IOSleep(1); - // If are growing old in this loop, then see if - // anyone else is still in osif_free. If not, - // we can exit. - if (iter >= a_waiters) { - // if is_freeing == f, then set is_freeing to true with - // release semantics (i.e. "push" it to other cores) - // then break; otherwise, set f to true relaxedly (i.e., - // optimize it out) - bool f = false; - if (__c11_atomic_compare_exchange_weak(&is_freeing, - &f, true, __ATOMIC_RELEASE, __ATOMIC_RELAXED)) { - break; - } - } - } - // If there is more than one thread in this function, osif_free() is - // protected by is_freeing. Release it after the osif_free() - // call has been made and the lastfree bookkeeping has been done. osif_free(vaddr, size); spl_xat_lastfree = gethrtime(); - is_freeing = false; - a_waiters--; - kpreempt(KPREEMPT_SYNC); - // since we just gave back xnu enough to satisfy an allocation - // in at least the smaller buckets, let's wake up anyone in - // the cv_wait() in vmem_xalloc([bucket_#], ...) vmem_bucket_wake_all_waiters(); } @@ -2797,13 +2638,30 @@ vmem_bucket_alloc(vmem_t *null_vmp, size_t size, const int vmflags) vmem_t *bvmp = vmem_bucket_arena_by_size(size); + void *fastm = vmem_alloc_impl(bvmp, size, + local_hipriority_allocator ? vmflags : vmflags | VM_BESTFIT); + + if (fastm != NULL) { + atomic_inc_64(&spl_vba_fastpath); + cv_broadcast(&calling_arena->vm_cv); + return (fastm); + } else if ((vmflags & (VM_NOSLEEP | VM_PANIC | VM_ABORT)) > 0) { + atomic_inc_64(&spl_vba_fastexit); + return (NULL); + } + + atomic_inc_64(&spl_vba_slowpath); + + /* work harder to avoid an allocation */ + const int slow_vmflags = vmflags | VM_BESTFIT; + // there are 13 buckets, so use a 16-bit scalar to hold // a set of bits, where each bit corresponds to an in-progress // vmem_alloc_impl(bucket, ...) below. static volatile _Atomic uint16_t buckets_busy_allocating = 0; const uint16_t bucket_number = vmem_bucket_number(size); - const uint16_t bucket_bit = (uint16_t)1 << bucket_number; + const uint16_t bucket_bit = (uint16_t)(1 << bucket_number); spl_vba_threads[bucket_number]++; @@ -2814,12 +2672,13 @@ vmem_bucket_alloc(vmem_t *null_vmp, size_t size, const int vmflags) bool loop_once = false; - if ((vmflags & (VM_NOSLEEP | VM_PANIC | VM_ABORT)) == 0 && + if ((slow_vmflags & (VM_NOSLEEP | VM_PANIC | VM_ABORT)) == 0 && ! 
vmem_canalloc_atomic(bvmp, size)) { if (spl_vmem_xnu_useful_bytes_free() < (MAX(size, 16ULL*1024ULL*1024ULL))) { - spl_free_set_emergency_pressure(size); - IOSleep(1); + spl_free_set_emergency_pressure( + total_memory >> 7LL); + IOSleep(2); if (!vmem_canalloc_atomic(bvmp, size) && (spl_vmem_xnu_useful_bytes_free() < (MAX(size, 16ULL*1024ULL*1024ULL)))) { @@ -2875,11 +2734,11 @@ vmem_bucket_alloc(vmem_t *null_vmp, size_t size, const int vmflags) loop_once = false; // non-waiting allocations should proceeed to vmem_alloc_impl() // immediately - if (vmflags & (VM_NOSLEEP | VM_PANIC | VM_ABORT)) { + if (slow_vmflags & (VM_NOSLEEP | VM_PANIC | VM_ABORT)) { break; } if (vmem_canalloc_atomic(bvmp, size)) { - // We can probably vmem_alloc(bvmp, size, vmflags). + // We can probably vmem_alloc(bvmp, size, slow_vmflags). // At worst case it will give us a NULL and we will // end up on the vmp's cv_wait. // @@ -2990,7 +2849,7 @@ vmem_bucket_alloc(vmem_t *null_vmp, size_t size, const int vmflags) timedout |= 2; extern uint64_t real_total_memory; spl_free_set_emergency_pressure( - real_total_memory / 64LL); + total_memory >> 7LL); // flush the current thread in xat() out of // xat()'s for() loop and into xat_bail() cv_broadcast(&bvmp->vm_cv); @@ -3083,7 +2942,7 @@ vmem_bucket_alloc(vmem_t *null_vmp, size_t size, const int vmflags) // because waiters was 0 when we entered this function, // subsequent callers will enter the for loop. - void *m = vmem_alloc_impl(bvmp, size, vmflags); + void *m = vmem_alloc_impl(bvmp, size, slow_vmflags); // allow another vmem_canalloc() through for this bucket // by atomically turning off the appropriate bit @@ -3118,7 +2977,7 @@ vmem_bucket_alloc(vmem_t *null_vmp, size_t size, const int vmflags) } static void -vmem_bucket_free(vmem_t *null_vmp, void *vaddr, size_t size) +vmem_bucket_free(vmem_t *null_vmp, const void *vaddr, size_t size) { vmem_t *calling_arena = spl_heap_arena; @@ -3150,7 +3009,7 @@ vmem_buckets_size(int typemask) { int64_t total_size = 0; - for (int i = 0; i < VMEM_BUCKETS; i++) { + for (uint16_t i = 0; i < VMEM_BUCKETS; i++) { int64_t u = vmem_bucket_arena_used(i); int64_t f = vmem_bucket_arena_free(i); if (typemask & VMEM_ALLOC) @@ -3277,9 +3136,9 @@ spl_vmem_default_alloc(vmem_t *vmp, size_t size, int vmflags) } static void -spl_vmem_default_free(vmem_t *vmp, void *vaddr, size_t size) +spl_vmem_default_free(vmem_t *vmp, const void *vaddr, size_t size) { - extern void osif_free(void *, uint64_t); + extern void osif_free(const void *, uint64_t); osif_free(vaddr, size); } @@ -3287,7 +3146,7 @@ vmem_t * vmem_init(const char *heap_name, void *heap_start, size_t heap_size, size_t heap_quantum, void *(*heap_alloc)(vmem_t *, size_t, int), - void (*heap_free)(vmem_t *, void *, size_t)) + void (*heap_free)(vmem_t *, const void *, size_t)) { uint32_t id; int nseg = VMEM_SEG_INITIAL; @@ -3313,7 +3172,7 @@ vmem_init(const char *heap_name, vmem_putseg_global(&vmem_seg0[nseg]); /* - * On OSX we ultimately have to use the OS allocator + * On Windows we ultimately have to use the OS allocator * as the ource and sink of memory as it is allocated * and freed. * @@ -3334,8 +3193,13 @@ vmem_init(const char *heap_name, // Intel can go with 4096 alignment, but arm64 needs 16384. So // we just use the larger. 
// turns out that Windows refuses alignment over 8192 - __declspec(align(PAGE_SIZE)) static char - initial_default_block[16ULL * 1024ULL * 1024ULL] = { 0 }; + // __declspec(align(PAGE_SIZE)) static char + // initial_default_block[INITIAL_BLOCK_SIZE] = { 0 }; + // ExAllocatePoolWithTag() + // If NumberOfBytes is PAGE_SIZE or greater, a page-aligned buffer + // is allocated + MALLOC(initial_default_block, void *, INITIAL_BLOCK_SIZE, M_TEMP, + M_WAITOK); // The default arena is very low-bandwidth; it supplies the initial // large allocation for the heap arena below, and it serves as the @@ -3343,9 +3207,9 @@ vmem_init(const char *heap_name, // or 3 parent_alloc calls (to spl_vmem_default_alloc) in total. spl_default_arena = vmem_create("spl_default_arena", // id 1 - initial_default_block, 16ULL*1024ULL*1024ULL, + initial_default_block, INITIAL_BLOCK_SIZE, heap_quantum, spl_vmem_default_alloc, spl_vmem_default_free, - spl_default_arena_parent, 16ULL*1024ULL*1024ULL, + spl_default_arena_parent, 32, /* minimum import */ VM_SLEEP | VMC_POPULATOR | VMC_NO_QCACHE); VERIFY(spl_default_arena != NULL); @@ -3377,17 +3241,15 @@ vmem_init(const char *heap_name, extern uint64_t real_total_memory; VERIFY3U(real_total_memory, >=, 1024ULL*1024ULL*1024ULL); - // adjust minimum bucket span size for memory size - // see comments in the switch below - // large span: 1 MiB and bigger on large-memory (> 32 GiB) systems - // small span: 256 kiB and bigger on large-memory systems - const uint64_t k = 1024ULL; - const uint64_t qm = 256ULL * k; - const uint64_t m = 1024ULL* k; - const uint64_t big = MAX(real_total_memory / (k * 32ULL), m); - const uint64_t small = MAX(real_total_memory / (k * 128ULL), qm); - spl_bucket_tunable_large_span = MIN(big, 16ULL * m); - spl_bucket_tunable_small_span = small; + /* + * Minimum bucket span size, which is what we ask IOMallocAligned for. + * See comments in the switch statement below. + * + * By default ask the kernel for at least 128kiB allocations. + */ + spl_bucket_tunable_large_span = spl_bucket_tunable_small_span = + 128ULL * 1024UL; + dprintf("SPL: %s: real_total_memory %llu, large spans %llu, small " "spans %llu\n", __func__, real_total_memory, spl_bucket_tunable_large_span, spl_bucket_tunable_small_span); @@ -3397,8 +3259,10 @@ vmem_init(const char *heap_name, for (int32_t i = VMEM_BUCKET_LOWBIT; i <= VMEM_BUCKET_HIBIT; i++) { const uint64_t bucket_largest_size = (1ULL << (uint64_t)i); + (void) snprintf(buf, VMEM_NAMELEN + 20, "%s_%llu", "bucket", bucket_largest_size); + dprintf("SPL: %s creating arena %s (i == %d)\n", __func__, buf, i); const int bucket_number = i - VMEM_BUCKET_LOWBIT; @@ -3411,13 +3275,14 @@ vmem_init(const char *heap_name, * bucket_heap arena. */ vmem_t *b = vmem_create(buf, NULL, 0, - // MAX(heap_quantum, bucket_largest_size), heap_quantum, xnu_alloc_throttled, xnu_free_throttled, spl_default_arena_parent, - MAX(heap_quantum * 8, bucket_largest_size * 2), + 32, /* minimum import */ VM_SLEEP | VMC_POPULATOR | VMC_NO_QCACHE | VMC_TIMEFREE); + VERIFY(b != NULL); + b->vm_source = b; vmem_bucket_arena[bucket_number] = b; vmem_bucket_id_to_bucket_number[b->vm_id] = bucket_number; @@ -3443,17 +3308,12 @@ vmem_init(const char *heap_name, // kstat.vmem.vmem.bucket_heap.parent_{alloc+free}, and improves with // increasing initial fixed allocation size. 
- const size_t mib = 1024ULL * 1024ULL; - const size_t gib = 1024ULL * mib; - size_t resv_size = 128ULL * mib; - extern uint64_t real_total_memory; + /* + * Add an initial segment to spl_heap_arena for convenience. + */ - if (real_total_memory >= 4ULL * gib) - resv_size = 256ULL * mib; - if (real_total_memory >= 8ULL * gib) - resv_size = 512ULL * mib; - if (real_total_memory >= 16ULL * gib) - resv_size = gib; + const size_t mib = 1024ULL * 1024ULL; + const size_t resv_size = 128ULL * mib; dprintf("SPL: %s adding fixed allocation of %llu to the bucket_heap\n", __func__, (uint64_t)resv_size); @@ -3465,6 +3325,7 @@ vmem_init(const char *heap_name, VERIFY(spl_heap_arena_initial_alloc != NULL); + /* remember size we allocated */ spl_heap_arena_initial_alloc_size = resv_size; // kstat.vmem.vmem.heap : kmem_cache_alloc() and similar calls @@ -3484,7 +3345,12 @@ vmem_init(const char *heap_name, vmem_metadata_arena = vmem_create("vmem_metadata", // id 17 NULL, 0, heap_quantum, vmem_alloc_impl, vmem_free_impl, spl_default_arena, - 8 * PAGESIZE, VM_SLEEP | VMC_POPULATOR | VMC_NO_QCACHE); +#ifdef __arm64__ + 2 * PAGESIZE, +#else + 8 * PAGESIZE, +#endif + VM_SLEEP | VMC_POPULATOR | VMC_NO_QCACHE); VERIFY(vmem_metadata_arena != NULL); @@ -3547,13 +3413,11 @@ static void vmem_fini_freelist(void *vmp, void *start, size_t size) void vmem_free_span_list(void) { - int total = 0; - int total_count = 0; + int total __maybe_unused = 0; struct free_slab *fs; // int release = 1; while ((fs = list_head(&freelist))) { - total_count++; total += fs->slabsize; list_remove(&freelist, fs); /* @@ -3753,6 +3617,9 @@ vmem_fini(vmem_t *heap) dprintf("SPL: %s destroying vmem_vmem_arena\n", __func__); vmem_destroy_internal(vmem_vmem_arena); + dprintf("SPL: %s: freeing initial_default_block\n", __func__); + FREE(initial_default_block, M_TEMP); + dprintf("SPL: arenas removed, now try destroying mutexes... "); dprintf("vmem_xnu_alloc_lock "); @@ -3788,8 +3655,15 @@ vmem_fini(vmem_t *heap) // segkmem_free(fs->vmp, fs->slab, fs->slabsize); FREE(fs, M_TEMP); } - dprintf("SPL: WOULD HAVE released %llu bytes (%llu spans) from" - " arenas\n", total, total_count); + if (total != 0 && total_count != 0) { + dprintf("SPL: %s:%d: WOULD HAVE released %llu bytes" + " (%llu spans) from arenas\n", + __func__, __LINE__, total, total_count); + } else { + dprintf("SPL: %s:%d good," + " did not have to force release any vmem spans", + __func__, __LINE__); + } list_destroy(&freelist); dprintf("SPL: %s: Brief delay for readability...\n", __func__); delay(hz); diff --git a/module/os/windows/zfs/abd_os.c b/module/os/windows/zfs/abd_os.c index bdf16dcaf21d..952e8573666d 100644 --- a/module/os/windows/zfs/abd_os.c +++ b/module/os/windows/zfs/abd_os.c @@ -13,6 +13,7 @@ * Copyright (c) 2014 by Chunwei Chen. All rights reserved. * Copyright (c) 2016 by Delphix. All rights reserved. * Copyright (c) 2020 by Jorgen Lundman. All rights reserved. + * Copyright (c) 2021 by Sean Doran. All rights reserved. */ /* @@ -32,7 +33,9 @@ #include #include #include -#include +#ifdef DEBUG +#include +#endif typedef struct abd_stats { kstat_named_t abdstat_struct_size; @@ -87,11 +90,42 @@ struct { * will cause the machine to panic if you change it and try to access the data * within a scattered ABD. */ -size_t zfs_abd_chunk_size = 4096; -lookasidelist_cache_t *abd_chunk_cache; +#if defined(__arm64__) +/* + * On ARM macOS, PAGE_SIZE is not a runtime constant! So here we have to + * guess at compile time. 
There is a balance between fewer kmem_caches, more
+ * memory use by "tails" of medium-sized ABDs, and more memory use by
+ * accounting structures if we use 4k versus 16k.
+ *
+ * Since the original *subpage* design expected PAGE_SIZE to be constant and
+ * the pre-subpage ABDs used PAGE_SIZE without requiring it to be a
+ * compile-time constant, let's use 16k initially and adjust downwards based
+ * on feedback.
+ */
+#define ABD_PGSIZE 16384
+#else
+#define ABD_PGSIZE PAGE_SIZE
+#endif
+
+const static size_t zfs_abd_chunk_size = ABD_PGSIZE;
+
+kmem_cache_t *abd_chunk_cache;
 static kstat_t *abd_ksp;
 
+/*
+ * Sub-ABD_PGSIZE allocations are segregated into kmem caches. This may be
+ * inefficient or counterproductive if in future the following conditions are
+ * not met.
+ */
+_Static_assert(SPA_MINBLOCKSHIFT == 9, "unexpected SPA_MINBLOCKSHIFT != 9");
+_Static_assert(ISP2(ABD_PGSIZE), "ABD_PGSIZE unexpectedly non power of 2");
+_Static_assert(ABD_PGSIZE >= 4096, "ABD_PGSIZE unexpectedly smaller than 4096");
+_Static_assert(ABD_PGSIZE <= 16384,
+    "ABD_PGSIZE unexpectedly larger than 16384");
+
+#define SUBPAGE_CACHE_INDICES (ABD_PGSIZE >> SPA_MINBLOCKSHIFT)
+kmem_cache_t *abd_subpage_cache[SUBPAGE_CACHE_INDICES] = { NULL };
 
 /*
  * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose chunks are
@@ -105,19 +139,19 @@ static char *abd_zero_buf = NULL;
 static void
 abd_free_chunk(void *c)
 {
-	lookasidelist_cache_free(abd_chunk_cache, c);
+	kmem_cache_free(abd_chunk_cache, c);
 }
 
-static size_t
+static inline size_t
 abd_chunkcnt_for_bytes(size_t size)
 {
 	return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size);
 }
 
-static inline size_t
+static size_t
 abd_scatter_chunkcnt(abd_t *abd)
 {
-	ASSERT(!abd_is_linear(abd));
+	VERIFY(!abd_is_linear(abd));
 	return (abd_chunkcnt_for_bytes(
 	    ABD_SCATTER(abd).abd_offset + abd->abd_size));
 }
@@ -125,7 +159,7 @@ abd_scatter_chunkcnt(abd_t *abd)
 boolean_t
 abd_size_alloc_linear(size_t size)
 {
-	return (size <= zfs_abd_chunk_size ?
B_TRUE : B_FALSE); + return (B_FALSE); } void @@ -137,12 +171,12 @@ abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op) ABDSTAT_BUMP(abdstat_scatter_cnt); ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size); ABDSTAT_INCR(abdstat_scatter_chunk_waste, - n * zfs_abd_chunk_size - abd->abd_size); + n * ABD_SCATTER(abd).abd_chunk_size - abd->abd_size); } else { ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); ABDSTAT_INCR(abdstat_scatter_chunk_waste, - abd->abd_size - n * zfs_abd_chunk_size); + abd->abd_size - n * ABD_SCATTER(abd).abd_chunk_size); } } @@ -169,31 +203,87 @@ abd_verify_scatter(abd_t *abd) VERIFY(!abd_is_linear_page(abd)); VERIFY3U(ABD_SCATTER(abd).abd_offset, <, zfs_abd_chunk_size); + VERIFY3U(ABD_SCATTER(abd).abd_offset, <, + ABD_SCATTER(abd).abd_chunk_size); + VERIFY3U(ABD_SCATTER(abd).abd_chunk_size, >=, + SPA_MINBLOCKSIZE); size_t n = abd_scatter_chunkcnt(abd); + + if (ABD_SCATTER(abd).abd_chunk_size != ABD_PGSIZE) { + VERIFY3U(n, ==, 1); + VERIFY3U(ABD_SCATTER(abd).abd_chunk_size, <, ABD_PGSIZE); + VERIFY3U(abd->abd_size, <=, ABD_SCATTER(abd).abd_chunk_size); + } + for (int i = 0; i < n; i++) { - ASSERT3P( + VERIFY3P( ABD_SCATTER(abd).abd_chunks[i], !=, NULL); } } +static inline int +abd_subpage_cache_index(const size_t size) +{ + const int idx = size >> SPA_MINBLOCKSHIFT; + + if ((size % SPA_MINBLOCKSIZE) == 0) + return (idx - 1); + else + return (idx); +} + +static inline uint_t +abd_subpage_enclosing_size(const int i) +{ + return (SPA_MINBLOCKSIZE * (i + 1)); +} + void abd_alloc_chunks(abd_t *abd, size_t size) { - size_t n = abd_chunkcnt_for_bytes(size); - for (int i = 0; i < n; i++) { - void *c = lookasidelist_cache_alloc(abd_chunk_cache); - ABD_SCATTER(abd).abd_chunks[i] = c; + VERIFY3U(size, >, 0); + if (size <= (zfs_abd_chunk_size - SPA_MINBLOCKSIZE)) { + const int i = abd_subpage_cache_index(size); + VERIFY3S(i, >=, 0); + VERIFY3S(i, <, SUBPAGE_CACHE_INDICES); + const uint_t s = abd_subpage_enclosing_size(i); + VERIFY3U(s, >=, size); + VERIFY3U(s, <, zfs_abd_chunk_size); + void *c = kmem_cache_alloc(abd_subpage_cache[i], KM_SLEEP); + ABD_SCATTER(abd).abd_chunks[0] = c; + ABD_SCATTER(abd).abd_chunk_size = s; + } else { + const size_t n = abd_chunkcnt_for_bytes(size); + + for (int i = 0; i < n; i++) { + void *c = kmem_cache_alloc(abd_chunk_cache, KM_SLEEP); + ABD_SCATTER(abd).abd_chunks[i] = c; + } + ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size; } - ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size; } void abd_free_chunks(abd_t *abd) { - size_t n = abd_scatter_chunkcnt(abd); - for (int i = 0; i < n; i++) { - abd_free_chunk(ABD_SCATTER(abd).abd_chunks[i]); + const uint_t abd_cs = ABD_SCATTER(abd).abd_chunk_size; + + if (abd_cs <= (zfs_abd_chunk_size - SPA_MINBLOCKSIZE)) { + VERIFY3U(abd->abd_size, <, zfs_abd_chunk_size); + VERIFY0(P2PHASE(abd_cs, SPA_MINBLOCKSIZE)); + + const int idx = abd_subpage_cache_index(abd_cs); + VERIFY3S(idx, >=, 0); + VERIFY3S(idx, <, SUBPAGE_CACHE_INDICES); + + kmem_cache_free(abd_subpage_cache[idx], + ABD_SCATTER(abd).abd_chunks[0]); + } else { + const size_t n = abd_scatter_chunkcnt(abd); + for (int i = 0; i < n; i++) { + abd_free_chunk(ABD_SCATTER(abd).abd_chunks[i]); + } } } @@ -236,7 +326,7 @@ static void abd_alloc_zero_scatter(void) { size_t n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE); - abd_zero_buf = lookasidelist_cache_alloc(abd_chunk_cache); + abd_zero_buf = kmem_cache_alloc(abd_chunk_cache, KM_SLEEP); memset(abd_zero_buf, 0, zfs_abd_chunk_size); abd_zero_scatter = 
abd_alloc_struct(SPA_MAXBLOCKSIZE); @@ -264,28 +354,142 @@ abd_free_zero_scatter(void) abd_free_struct(abd_zero_scatter); abd_zero_scatter = NULL; - lookasidelist_cache_free(abd_chunk_cache, abd_zero_buf); + kmem_cache_free(abd_chunk_cache, abd_zero_buf); +} + +static int +abd_kstats_update(kstat_t *ksp, int rw) +{ + abd_stats_t *as = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + as->abdstat_struct_size.value.ui64 = + wmsum_value(&abd_sums.abdstat_struct_size); + as->abdstat_scatter_cnt.value.ui64 = + wmsum_value(&abd_sums.abdstat_scatter_cnt); + as->abdstat_scatter_data_size.value.ui64 = + wmsum_value(&abd_sums.abdstat_scatter_data_size); + as->abdstat_scatter_chunk_waste.value.ui64 = + wmsum_value(&abd_sums.abdstat_scatter_chunk_waste); + as->abdstat_linear_cnt.value.ui64 = + wmsum_value(&abd_sums.abdstat_linear_cnt); + as->abdstat_linear_data_size.value.ui64 = + wmsum_value(&abd_sums.abdstat_linear_data_size); + return (0); } void abd_init(void) { - abd_chunk_cache = lookasidelist_cache_create("abd_chunk", - zfs_abd_chunk_size); + /* check if we guessed ABD_PGSIZE correctly */ + ASSERT3U(ABD_PGSIZE, ==, PAGE_SIZE); + +#ifdef DEBUG + /* + * KMF_BUFTAG | KMF_LITE on the abd kmem_caches causes them to waste + * up to 50% of their memory for redzone. Even in DEBUG builds this + * therefore should be KMC_NOTOUCH unless there are concerns about + * overruns, UAFs, etc involving abd chunks or subpage chunks. + * + * Additionally these KMF_ + * flags require the definitions from + */ + + /* + * DEBUGGING: do this + * const int cflags = KMF_BUFTAG | KMF_LITE; + * or + * const int cflags = KMC_ARENA_SLAB; + */ + + int cflags = KMC_ARENA_SLAB; +#else + int cflags = KMC_ARENA_SLAB; +#endif + +#ifdef _KERNEL +/* This must all match spl-seg_kmem.c : segkmem_abd_init() */ +#define SMALL_RAM_MACHINE (4ULL * 1024ULL * 1024ULL * 1024ULL) + + extern uint64_t total_memory; + + if (total_memory < SMALL_RAM_MACHINE) { + cflags = KMC_NOTOUCH; + } +#endif + + abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, + ABD_PGSIZE, + NULL, NULL, NULL, NULL, abd_arena, cflags); + + wmsum_init(&abd_sums.abdstat_struct_size, 0); + wmsum_init(&abd_sums.abdstat_scatter_cnt, 0); + wmsum_init(&abd_sums.abdstat_scatter_data_size, 0); + wmsum_init(&abd_sums.abdstat_scatter_chunk_waste, 0); + wmsum_init(&abd_sums.abdstat_linear_cnt, 0); + wmsum_init(&abd_sums.abdstat_linear_data_size, 0); abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (abd_ksp != NULL) { abd_ksp->ks_data = &abd_stats; + abd_ksp->ks_update = abd_kstats_update; kstat_install(abd_ksp); } abd_alloc_zero_scatter(); + + /* + * Check at compile time that SPA_MINBLOCKSIZE is 512, because we want + * to build sub-page-size linear ABD kmem caches at multiples of + * SPA_MINBLOCKSIZE. If SPA_MINBLOCKSIZE ever changes, a different + * layout should be calculated at runtime. + * + * See also the assertions above the definition of abd_subpbage_cache. 
+ */ + + _Static_assert(SPA_MINBLOCKSIZE == 512, + "unexpected SPA_MINBLOCKSIZE != 512"); + + const int step_size = SPA_MINBLOCKSIZE; + for (int bytes = step_size; bytes < ABD_PGSIZE; bytes += step_size) { + char name[36]; + + (void) snprintf(name, sizeof (name), + "abd_subpage_%lu", (ulong_t)bytes); + + const int index = (bytes >> SPA_MINBLOCKSHIFT) - 1; + VERIFY3S(index, >=, 0); + VERIFY3S(index, <, SUBPAGE_CACHE_INDICES); + +#ifdef DEBUG + int csubflags = KMF_LITE; +#else + int csubflags = 0; +#endif +#ifdef _KERNEL + if (total_memory < SMALL_RAM_MACHINE) + csubflags = cflags; +#endif + abd_subpage_cache[index] = + kmem_cache_create(name, bytes, sizeof (void *), + NULL, NULL, NULL, NULL, abd_subpage_arena, csubflags); + + VERIFY3P(abd_subpage_cache[index], !=, NULL); + } } void abd_fini(void) { + const int step_size = SPA_MINBLOCKSIZE; + for (int bytes = step_size; bytes < ABD_PGSIZE; bytes += step_size) { + const int index = (bytes >> SPA_MINBLOCKSHIFT) - 1; + kmem_cache_destroy(abd_subpage_cache[index]); + abd_subpage_cache[index] = NULL; + } + abd_free_zero_scatter(); if (abd_ksp != NULL) { @@ -293,7 +497,14 @@ abd_fini(void) abd_ksp = NULL; } - lookasidelist_cache_destroy(abd_chunk_cache); + wmsum_fini(&abd_sums.abdstat_struct_size); + wmsum_fini(&abd_sums.abdstat_scatter_cnt); + wmsum_fini(&abd_sums.abdstat_scatter_data_size); + wmsum_fini(&abd_sums.abdstat_scatter_chunk_waste); + wmsum_fini(&abd_sums.abdstat_linear_cnt); + wmsum_fini(&abd_sums.abdstat_linear_data_size); + + kmem_cache_destroy(abd_chunk_cache); abd_chunk_cache = NULL; } @@ -323,27 +534,64 @@ abd_alloc_for_io(size_t size, boolean_t is_metadata) return (abd_alloc_linear(size, is_metadata)); } + +/* + * return an ABD structure that peers into source ABD sabd. The returned ABD + * may be new, or the one supplied as abd. abd and sabd must point to one or + * more zfs_abd_chunk_size (ABD_PGSIZE) chunks, or point to one and exactly one + * smaller chunk. + * + * The [off, off+size] range must be found within (and thus + * fit within) the source ABD. + */ + abd_t * abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off, size_t size) { abd_verify(sabd); VERIFY3U(off, <=, sabd->abd_size); - size_t new_offset = ABD_SCATTER(sabd).abd_offset + off; + const uint_t sabd_chunksz = ABD_SCATTER(sabd).abd_chunk_size; + + const size_t new_offset = ABD_SCATTER(sabd).abd_offset + off; + + /* subpage ABD range checking */ + if (sabd_chunksz != zfs_abd_chunk_size) { + /* off+size must fit in 1 chunk */ + VERIFY3U(off + size, <=, sabd_chunksz); + /* new_offset must be in bounds of 1 chunk */ + VERIFY3U(new_offset, <=, sabd_chunksz); + /* new_offset + size must be in bounds of 1 chunk */ + VERIFY3U(new_offset + size, <=, sabd_chunksz); + } /* * chunkcnt is abd_chunkcnt_for_bytes(size), which rounds * up to the nearest chunk, but we also must take care * of the offset *in the leading chunk* */ - size_t chunkcnt = abd_chunkcnt_for_bytes( - (new_offset % zfs_abd_chunk_size) + size); + const size_t chunkcnt = (sabd_chunksz != zfs_abd_chunk_size) + ? 
1 + : abd_chunkcnt_for_bytes((new_offset % sabd_chunksz) + size); + /* sanity checks on chunkcnt */ VERIFY3U(chunkcnt, <=, abd_scatter_chunkcnt(sabd)); + VERIFY3U(chunkcnt, >, 0); + + /* non-subpage sanity checking */ + if (chunkcnt > 1) { + /* compare with legacy calculation of chunkcnt */ + VERIFY3U(chunkcnt, ==, abd_chunkcnt_for_bytes( + P2PHASE(new_offset, zfs_abd_chunk_size) + size)); + /* EITHER subpage chunk (singular) or std chunks */ + VERIFY3U(sabd_chunksz, ==, zfs_abd_chunk_size); + } /* - * If an abd struct is provided, it is only the minimum size. If we - * need additional chunks, we need to allocate a new struct. + * If an abd struct is provided, it is only the minimum size (and + * almost certainly provided as an abd_t embedded in a larger + * structure). If we need additional chunks, we need to allocate a + * new struct. */ if (abd != NULL && offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]) > @@ -352,7 +600,7 @@ abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off, size_t size) } if (abd == NULL) - abd = abd_alloc_struct(chunkcnt * zfs_abd_chunk_size); + abd = abd_alloc_struct(chunkcnt * sabd_chunksz); /* * Even if this buf is filesystem metadata, we only track that @@ -360,13 +608,24 @@ abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off, size_t size) * this case. Therefore, we don't ever use ABD_FLAG_META here. */ - ABD_SCATTER(abd).abd_offset = new_offset % zfs_abd_chunk_size; - ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size; + /* update offset, and sanity check it */ + ABD_SCATTER(abd).abd_offset = new_offset % sabd_chunksz; + + VERIFY3U(ABD_SCATTER(abd).abd_offset, <, sabd_chunksz); + VERIFY3U(ABD_SCATTER(abd).abd_offset + size, <=, + chunkcnt * sabd_chunksz); + + ABD_SCATTER(abd).abd_chunk_size = sabd_chunksz; + + if (chunkcnt > 1) { + VERIFY3U(ABD_SCATTER(sabd).abd_chunk_size, ==, + zfs_abd_chunk_size); + } /* Copy the scatterlist starting at the correct offset */ (void) memcpy(&ABD_SCATTER(abd).abd_chunks, &ABD_SCATTER(sabd).abd_chunks[new_offset / - zfs_abd_chunk_size], + sabd_chunksz], chunkcnt * sizeof (void *)); return (abd); @@ -377,15 +636,16 @@ abd_iter_scatter_chunk_offset(struct abd_iter *aiter) { ASSERT(!abd_is_linear(aiter->iter_abd)); return ((ABD_SCATTER(aiter->iter_abd).abd_offset + - aiter->iter_pos) % zfs_abd_chunk_size); + aiter->iter_pos) % + ABD_SCATTER(aiter->iter_abd).abd_chunk_size); } static inline size_t abd_iter_scatter_chunk_index(struct abd_iter *aiter) { ASSERT(!abd_is_linear(aiter->iter_abd)); - return ((ABD_SCATTER(aiter->iter_abd).abd_offset + - aiter->iter_pos) / zfs_abd_chunk_size); + return ((ABD_SCATTER(aiter->iter_abd).abd_offset + aiter->iter_pos) + / ABD_SCATTER(aiter->iter_abd).abd_chunk_size); } /* @@ -443,9 +703,30 @@ abd_iter_map(struct abd_iter *aiter) ASSERT3P(aiter->iter_mapaddr, ==, NULL); ASSERT0(aiter->iter_mapsize); +#if 0 /* Panic if someone has changed zfs_abd_chunk_size */ + IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size == ABD_SCATTER(aiter->iter_abd).abd_chunk_size); +#else + /* + * If scattered, VERIFY that we are using ABD_PGSIZE chunks, or we have + * one and only one chunk of less than ABD_PGSIZE. 
+	 */
+
+	if (!abd_is_linear(aiter->iter_abd)) {
+		if (ABD_SCATTER(aiter->iter_abd).abd_chunk_size !=
+		    zfs_abd_chunk_size) {
+			VERIFY3U(
+			    ABD_SCATTER(aiter->iter_abd).abd_chunk_size,
+			    <, zfs_abd_chunk_size);
+			VERIFY3U(aiter->iter_abd->abd_size,
+			    <, zfs_abd_chunk_size);
+			VERIFY3U(aiter->iter_abd->abd_size,
+			    <=, ABD_SCATTER(aiter->iter_abd).abd_chunk_size);
+		}
+	}
+#endif
 
 	/* There's nothing left to iterate over, so do nothing */
 	if (abd_iter_at_end(aiter))
@@ -457,8 +738,12 @@ abd_iter_map(struct abd_iter *aiter)
 		paddr = ABD_LINEAR_BUF(aiter->iter_abd);
 	} else {
 		size_t index = abd_iter_scatter_chunk_index(aiter);
+		IMPLY(ABD_SCATTER(aiter->iter_abd).abd_chunk_size != ABD_PGSIZE,
+		    index == 0);
 		offset = abd_iter_scatter_chunk_offset(aiter);
-		aiter->iter_mapsize = MIN(zfs_abd_chunk_size - offset,
+		aiter->iter_mapsize = MIN(
+		    ABD_SCATTER(aiter->iter_abd).abd_chunk_size
+		    - offset,
 		    aiter->iter_abd->abd_size - aiter->iter_pos);
 		paddr = ABD_SCATTER(aiter->iter_abd).abd_chunks[index];
 	}
@@ -472,12 +757,10 @@ abd_iter_map(struct abd_iter *aiter)
 void
 abd_iter_unmap(struct abd_iter *aiter)
 {
-	/* There's nothing left to unmap, so do nothing */
-	if (abd_iter_at_end(aiter))
-		return;
-
-	ASSERT3P(aiter->iter_mapaddr, !=, NULL);
-	ASSERT3U(aiter->iter_mapsize, >, 0);
+	if (!abd_iter_at_end(aiter)) {
+		ASSERT3P(aiter->iter_mapaddr, !=, NULL);
+		ASSERT3U(aiter->iter_mapsize, >, 0);
+	}
 
 	aiter->iter_mapaddr = NULL;
 	aiter->iter_mapsize = 0;
@@ -486,5 +769,20 @@ abd_iter_unmap(struct abd_iter *aiter)
 void
 abd_cache_reap_now(void)
 {
-	// do nothing
+	/*
+	 * This function is called by arc_kmem_reap_soon(), which also invokes
+	 * kmem_cache_reap_now() on several other kmem caches.
+	 *
+	 * kmem_cache_reap_now() now operates on all kmem caches at each
+	 * invocation (ignoring its kmem_cache_t argument except for an ASSERT
+	 * in DEBUG builds) by invoking kmem_reap(). Previously
+	 * kmem_cache_reap_now() would clear the cache's magazine working
+	 * set and start a reap immediately, without regard to the
+	 * kmem_reaping compare-and-swap flag.
+	 *
+	 * Previously in this function we would call kmem_cache_reap_now() for
+	 * each of the abd_chunk and subpage kmem caches. Now, since this
+	 * function is called after several kmem_cache_reap_now() invocations,
+	 * it can be a no-op.
+	 */
 }
diff --git a/module/os/windows/zfs/arc_os.c b/module/os/windows/zfs/arc_os.c
index 65a2ab075f42..e586b87b5581 100644
--- a/module/os/windows/zfs/arc_os.c
+++ b/module/os/windows/zfs/arc_os.c
@@ -826,3 +826,11 @@ void
 arc_unregister_hotplug(void)
 {
 }
+
+void
+spl_set_arc_no_grow(int i)
+{
+	arc_no_grow = i;
+	if (i == B_TRUE)
+		membar_producer(); /* make it visible to other threads */
+}
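
The reworked spl_vmem_xnu_useful_bytes_free() in spl-vmem.c above carves a reserve of total_memory >> 9 out of the reported free space so that VM_NOWAIT/VM_PANIC allocations still have headroom. Below is a minimal user-space model of that arithmetic, a sketch only and not the driver code: the inputs stand in for the kernel counters, and MODEL_PAGE_SIZE plus the model_* names are invented for this illustration.

/*
 * User-space model (illustrative only) of the reserve carve-out in
 * spl_vmem_xnu_useful_bytes_free(); inputs are stand-ins for kernel counters.
 */
#include <stdint.h>
#include <stdio.h>

#define	MODEL_PAGE_SIZE	4096ULL

static uint64_t
model_useful_bytes_free(uint64_t total_memory, uint64_t total_allocated,
    uint32_t pages_wanted, uint32_t pages_reclaimed, uint32_t pressure_level)
{
	const uint64_t reserve = total_memory >> 9;	/* ~0.2% held back */
	const uint64_t total_minus_reserve = total_memory - reserve;

	if (pages_wanted > 0) {
		if (total_allocated >= total_minus_reserve)
			return (MODEL_PAGE_SIZE *
			    (pages_reclaimed > 0 ? pages_reclaimed : 1));
		return (total_minus_reserve -
		    (total_allocated + MODEL_PAGE_SIZE * pages_reclaimed));
	}

	if (pressure_level > 0 && pressure_level < 100) {
		if (pages_reclaimed > 0)
			return (MODEL_PAGE_SIZE * pages_reclaimed);
		if (total_allocated < total_minus_reserve)
			return (MODEL_PAGE_SIZE);
		return (0);
	}

	/* no pressure: everything that is neither allocated nor reserved */
	return (total_minus_reserve - total_allocated);
}

int
main(void)
{
	/* 16 GiB machine, 12 GiB allocated, no pressure: 4 GiB - 32 MiB */
	printf("%llu\n", (unsigned long long)model_useful_bytes_free(
	    16ULL << 30, 12ULL << 30, 0, 0, 0));
	return (0);
}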
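
vmem_hash_rescale() now sizes and allocates the replacement hash table outside vm_lock, then re-derives the target size under the lock and backs out if another thread changed the arena in the meantime. The sketch below shows that check-again-under-the-lock pattern in isolation; pthreads and calloc stand in for kmutex_t and vmem_alloc_impl(), and the model_* names are invented here.

/*
 * Illustrative sketch of the allocate-outside, re-check-under-the-lock
 * pattern used by vmem_hash_rescale(); not the driver code.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>

struct model_arena {
	pthread_mutex_t lock;
	size_t nseg;		/* segments currently hashed */
	void **hash_table;
	size_t hash_size;
};

static size_t
model_target_size(size_t nseg)
{
	/* any monotone function of nseg is enough for the sketch */
	size_t sz = 16;

	while (sz < 3 * nseg + 4)
		sz <<= 1;
	return (sz);
}

static bool
model_rescale(struct model_arena *a)
{
	/* speculative work, done without holding the lock */
	const size_t nolock_new_size = model_target_size(a->nseg);
	void **new_table = calloc(nolock_new_size, sizeof (void *));

	if (new_table == NULL)
		return (false);

	pthread_mutex_lock(&a->lock);
	if (model_target_size(a->nseg) != nolock_new_size) {
		/* the arena changed while we allocated: back out */
		pthread_mutex_unlock(&a->lock);
		free(new_table);
		return (false);
	}
	/* safe to swap in; rehashing of the old entries would go here */
	free(a->hash_table);
	a->hash_table = new_table;
	a->hash_size = nolock_new_size;
	pthread_mutex_unlock(&a->lock);
	return (true);
}

int
main(void)
{
	struct model_arena a = { PTHREAD_MUTEX_INITIALIZER, 100, NULL, 0 };

	return (model_rescale(&a) ? 0 : 1);
}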
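
vmem_update() and vmem_bucket_wake_all_waiters() now take the arena mutex around cv_broadcast() so that waiters observe a consistent arena and get a fair shot at a lock the waker may re-take immediately afterwards. A minimal pthreads illustration of the idiom, with invented model_* names:

/* Lock-then-broadcast idiom; pthreads stand in for kmutex_t/kcondvar_t. */
#include <pthread.h>

struct model_arena_cv {
	pthread_mutex_t lock;
	pthread_cond_t cv;
};

static void
model_wake_waiters(struct model_arena_cv *a)
{
	pthread_mutex_lock(&a->lock);
	pthread_cond_broadcast(&a->cv);	/* waiters run once we drop the lock */
	pthread_mutex_unlock(&a->lock);
}

int
main(void)
{
	struct model_arena_cv a = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER };

	model_wake_waiters(&a);
	return (0);
}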
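
xnu_alloc_throttled() above replaces the old xnu_alloc_throttled_bail() spin with a remembered failure level: fail_at creeps upward after every 128 successful OS allocations and snaps down to just below current usage whenever the OS refuses an allocation. The model below covers only that bookkeeping; fake_os_alloc() is an invented stand-in for osif_malloc(), and the real code additionally sets arc_no_grow, applies emergency pressure, and sleeps in 100 ms steps for up to roughly ten seconds before giving up.

/*
 * Illustrative model of the fail_at / success_ct bookkeeping in
 * xnu_alloc_throttled(); fake_os_alloc() is an invented stand-in.
 * The model leaks its allocations; only the bookkeeping matters here.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint64_t fail_at;	/* usage level at which allocations failed */
static int16_t success_ct;
static uint64_t total_allocated;

static void *
fake_os_alloc(size_t size)
{
	/* pretend the OS refuses us beyond 1 GiB of model usage */
	if (total_allocated + size > (1ULL << 30))
		return (NULL);
	total_allocated += size;
	return (malloc(size));
}

static void *
model_throttled_alloc(size_t size)
{
	void *p = fake_os_alloc(size);

	if (p != NULL) {
		/* creep the failure ceiling upward every 128 successes */
		if (success_ct++ >= 128) {
			fail_at += size;
			success_ct = 0;
		}
		return (p);
	}

	/*
	 * Remember roughly where the OS stopped giving us memory; a retry
	 * loop would only try again while usage is below this level.
	 */
	success_ct = 0;
	fail_at = total_allocated - size;
	return (NULL);
}

int
main(void)
{
	for (int i = 0; i < 12; i++)
		(void) model_throttled_alloc(128ULL << 20);	/* 128 MiB */
	printf("fail_at=%llu total=%llu\n",
	    (unsigned long long)fail_at,
	    (unsigned long long)total_allocated);
	return (0);
}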
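
When memory caps are enforced and an allocation fails, the patch lowers spl_dynamic_memory_cap to about total_memory/32 below the observed failure level, but never below total_memory/8. A worked user-space version of that arithmetic, with model_* names invented here:

/*
 * Illustrative arithmetic for the downward adjustment of the dynamic
 * memory cap in xnu_alloc_throttled(); not the driver code.
 */
#include <stdint.h>
#include <stdio.h>

static int64_t
model_reduced_cap(int64_t total_memory, int64_t fail_at, int64_t current_cap)
{
	const int64_t floor_cap = total_memory >> 3;		/* 1/8 floor */
	const int64_t below_fail_at = fail_at - (total_memory >> 5); /* -1/32 */
	const int64_t reduced =
	    (below_fail_at > floor_cap) ? below_fail_at : floor_cap;

	if (current_cap == 0 || current_cap >= total_memory)
		return (reduced);	/* cap was unset or ineffective */
	if (floor_cap > current_cap)
		return (floor_cap);	/* already below the floor */
	return (reduced);
}

int
main(void)
{
	/* 16 GiB machine that began failing at 14 GiB of allocations */
	const int64_t gib = 1LL << 30;

	/* prints 14495514624, i.e. 14 GiB minus 512 MiB */
	printf("%lld\n", (long long)model_reduced_cap(16 * gib, 14 * gib, 0));
	return (0);
}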
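
abd_alloc_chunks() and abd_free_chunks() route sub-page requests to one of the abd_subpage_cache kmem caches via abd_subpage_cache_index() and abd_subpage_enclosing_size(): a request is rounded up to the next multiple of SPA_MINBLOCKSIZE and served from the cache of exactly that size. The assertions below walk through that index math, assuming SPA_MINBLOCKSIZE == 512 as the patch does; the model_* names are local to this sketch.

/*
 * Worked example (illustrative only) of the sub-page cache index math in
 * abd_os.c; assumes a 512-byte SPA_MINBLOCKSIZE.
 */
#include <assert.h>
#include <stddef.h>

#define	MODEL_MINBLOCKSHIFT	9
#define	MODEL_MINBLOCKSIZE	(1 << MODEL_MINBLOCKSHIFT)	/* 512 */

static int
model_subpage_cache_index(size_t size)
{
	const int idx = (int)(size >> MODEL_MINBLOCKSHIFT);

	/* exact multiples of 512 land in the previous bucket */
	return ((size % MODEL_MINBLOCKSIZE) == 0 ? idx - 1 : idx);
}

static unsigned
model_subpage_enclosing_size(int idx)
{
	return (MODEL_MINBLOCKSIZE * (idx + 1));
}

int
main(void)
{
	assert(model_subpage_cache_index(512) == 0);	/* 512 B cache */
	assert(model_subpage_cache_index(513) == 1);	/* 1024 B cache */
	assert(model_subpage_cache_index(4000) == 7);	/* 4096 B cache */
	assert(model_subpage_enclosing_size(
	    model_subpage_cache_index(4000)) >= 4000);
	return (0);
}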
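
abd_get_offset_scatter() computes the chunk count of the new view differently for the two chunk layouts: a sub-page source always yields exactly one chunk (and the view must fit inside it), while a standard ABD_PGSIZE-chunked source needs enough chunks to cover the leading-chunk offset plus the requested size. A small model of that arithmetic, with model_* names invented here:

/*
 * Illustrative model of the offset/chunk-count arithmetic in
 * abd_get_offset_scatter(); not the driver code.
 */
#include <assert.h>
#include <stddef.h>

static size_t
model_chunkcnt_for_bytes(size_t size, size_t chunk_size)
{
	return ((size + chunk_size - 1) / chunk_size);	/* round up */
}

static size_t
model_view_chunkcnt(size_t src_offset, size_t off, size_t size,
    size_t chunk_size, size_t full_chunk_size)
{
	const size_t new_offset = src_offset + off;

	if (chunk_size != full_chunk_size) {
		/* sub-page source: the view must fit in the single chunk */
		assert(new_offset + size <= chunk_size);
		return (1);
	}
	/* standard chunks: also cover the offset within the leading chunk */
	return (model_chunkcnt_for_bytes(
	    (new_offset % chunk_size) + size, chunk_size));
}

int
main(void)
{
	/* a 20 KiB view starting 10 KiB into a 16 KiB-chunked scatter ABD */
	assert(model_view_chunkcnt(0, 10240, 20480, 16384, 16384) == 2);
	/* a 1 KiB view starting 512 B into a single 2 KiB sub-page chunk */
	assert(model_view_chunkcnt(0, 512, 1024, 2048, 16384) == 1);
	return (0);
}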