diff --git a/include/os/windows/spl/sys/debug.h b/include/os/windows/spl/sys/debug.h index 86a72df1e2bf..541bf0555ddd 100644 --- a/include/os/windows/spl/sys/debug.h +++ b/include/os/windows/spl/sys/debug.h @@ -91,6 +91,11 @@ #endif +// cdefs.h +#ifndef __DECONST +#define __DECONST(type, var) ((type)(uintptr_t)(const void *)(var)) +#endif + extern void _Noreturn panic(const char *fmt, ...); diff --git a/include/os/windows/spl/sys/kmem.h b/include/os/windows/spl/sys/kmem.h index 412db7600ead..07ca684ca65e 100644 --- a/include/os/windows/spl/sys/kmem.h +++ b/include/os/windows/spl/sys/kmem.h @@ -72,7 +72,7 @@ extern uint64_t physmem; void *zfs_kmem_alloc(size_t size, int kmflags); void *zfs_kmem_zalloc(size_t size, int kmflags); -void zfs_kmem_free(void *buf, size_t size); +void zfs_kmem_free(const void *buf, size_t size); void spl_kmem_init(uint64_t); void spl_kmem_thread_init(); @@ -128,7 +128,7 @@ kmem_cache_t *kmem_cache_create(char *name, size_t bufsize, size_t align, void *_private, struct vmem *vmp, int cflags); void kmem_cache_destroy(kmem_cache_t *cache); void *kmem_cache_alloc(kmem_cache_t *cache, int flags); -void kmem_cache_free(kmem_cache_t *cache, void *buf); +void kmem_cache_free(kmem_cache_t *cache, const void *buf); void kmem_cache_free_to_slab(kmem_cache_t *cache, void *buf); void kmem_cache_reap_now(kmem_cache_t *cache); void kmem_depot_ws_zero(kmem_cache_t *cache); diff --git a/include/os/windows/spl/sys/kmem_impl.h b/include/os/windows/spl/sys/kmem_impl.h index 50d32cc74cac..98593d3601ee 100644 --- a/include/os/windows/spl/sys/kmem_impl.h +++ b/include/os/windows/spl/sys/kmem_impl.h @@ -372,6 +372,7 @@ struct kmem_cache { uint64_t cache_bufmax; /* max buffers ever */ uint64_t cache_bufslab; /* buffers free in slab layer */ uint64_t cache_reap; /* cache reaps */ + kmutex_t cache_reap_lock; /* one reap at a time */ uint64_t cache_rescale; /* hash table rescales */ uint64_t cache_lookup_depth; /* hash lookup depth */ uint64_t cache_depot_contention; /* mutex contention count */ @@ -464,7 +465,7 @@ typedef struct kmem_log_header { kmutex_t lh_lock; char *lh_base; uint32_t *lh_free; - uint32_t lh_chunksize; + size_t lh_chunksize; uint32_t lh_nchunks; uint32_t lh_head; uint32_t lh_tail; diff --git a/include/os/windows/spl/sys/seg_kmem.h b/include/os/windows/spl/sys/seg_kmem.h index a4784b2fc0de..728ba1bb426c 100644 --- a/include/os/windows/spl/sys/seg_kmem.h +++ b/include/os/windows/spl/sys/seg_kmem.h @@ -42,6 +42,7 @@ extern "C" { extern uint64_t segkmem_total_allocated; extern vmem_t *abd_arena; +extern vmem_t *abd_subpage_arena; /* * segkmem page vnodes @@ -54,7 +55,7 @@ extern vmem_t *abd_arena; #endif /* __sparc */ void *segkmem_alloc(vmem_t *, size_t, int); -extern void segkmem_free(vmem_t *, void *, size_t); +extern void segkmem_free(vmem_t *, const void *, size_t); extern void kernelheap_init(void); extern void kernelheap_fini(void); extern void *segkmem_zio_alloc(vmem_t *, size_t, int); diff --git a/include/os/windows/spl/sys/vmem.h b/include/os/windows/spl/sys/vmem.h index e24d7c53c729..fb45c027a796 100644 --- a/include/os/windows/spl/sys/vmem.h +++ b/include/os/windows/spl/sys/vmem.h @@ -125,7 +125,7 @@ struct vmem; typedef struct vmem vmem_t; typedef void *(vmem_alloc_t)(vmem_t *, size_t, int); -typedef void (vmem_free_t)(vmem_t *, void *, size_t); +typedef void (vmem_free_t)(vmem_t *, const void *, size_t); /* * Alternate import style; the requested size is passed in a pointer, @@ -151,8 +151,8 @@ extern void vmem_destroy(vmem_t *); extern void 
*vmem_alloc_impl(vmem_t *, size_t, int); extern void *vmem_xalloc(vmem_t *, size_t, size_t, size_t, size_t, void *, void *, int); -extern void vmem_free_impl(vmem_t *, void *, size_t); -extern void vmem_xfree(vmem_t *, void *, size_t); +extern void vmem_free_impl(vmem_t *, const void *, size_t); +extern void vmem_xfree(vmem_t *, const void *, size_t); extern void *vmem_add(vmem_t *, void *, size_t, int); extern int vmem_contains(vmem_t *, void *, size_t); extern void vmem_walk(vmem_t *, int, void (*)(void *, void *, size_t), diff --git a/include/os/windows/spl/sys/vmem_impl.h b/include/os/windows/spl/sys/vmem_impl.h index fc52a0f3b890..45db4d47b9e5 100644 --- a/include/os/windows/spl/sys/vmem_impl.h +++ b/include/os/windows/spl/sys/vmem_impl.h @@ -114,6 +114,8 @@ typedef struct vmem_kstat { kstat_named_t vk_parent_free; /* called the source free function */ kstat_named_t vk_threads_waiting; /* threads in cv_wait in vmem */ kstat_named_t vk_excess; /* count of retained excess imports */ + kstat_named_t vk_lowest_stack; /* least remaining stack seen */ + kstat_named_t vk_async_stack_calls; /* times allocated off-thread */ } vmem_kstat_t; struct vmem { diff --git a/module/os/windows/spl/spl-kmem.c b/module/os/windows/spl/spl-kmem.c index eaf417d07a07..88dd75832633 100644 --- a/module/os/windows/spl/spl-kmem.c +++ b/module/os/windows/spl/spl-kmem.c @@ -24,7 +24,7 @@ * Copyright (C) 2008 MacZFS * Copyright (C) 2013, 2020 Jorgen Lundman * Copyright (C) 2014 Brendon Humphrey - * Copyright (C) 2017 Sean Doran + * Copyright (C) 2017, 2021, 2023 Sean Doran * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Portions Copyright 2022 Andrew Innes * @@ -71,8 +71,7 @@ const unsigned int spl_vm_page_free_min = 3500; static kcondvar_t spl_free_thread_cv; static kmutex_t spl_free_thread_lock; static boolean_t spl_free_thread_exit; -static volatile _Atomic int64_t spl_free; -int64_t spl_free_delta_ema; +static volatile _Atomic int64_t spl_free = 0; static boolean_t spl_event_thread_exit = FALSE; PKEVENT low_mem_event = NULL; @@ -82,11 +81,33 @@ static volatile _Atomic boolean_t spl_free_fast_pressure = FALSE; static _Atomic bool spl_free_maybe_reap_flag = false; static _Atomic uint64_t spl_free_last_pressure = 0; +uint64_t spl_enforce_memory_caps = 1; +_Atomic uint64_t spl_dynamic_memory_cap = 0; +hrtime_t spl_dynamic_memory_cap_last_downward_adjust = 0; +uint64_t spl_dynamic_memory_cap_skipped = 0; +kmutex_t spl_dynamic_memory_cap_lock; +uint64_t spl_dynamic_memory_cap_reductions = 0; +uint64_t spl_dynamic_memory_cap_hit_floor = 0; +static uint64_t spl_manual_memory_cap = 0; +static uint64_t spl_memory_cap_enforcements = 0; + +extern void spl_set_arc_no_grow(int); + +/* + * variables informed by "pure" mach_vm_pressure interface + * + * osfmk/vm/vm_pageout.c: "We don't need fully + * accurate monitoring anyway..." + * + * but in macOS_pure we do want modifications of these + * variables to be seen by all the other threads + * consistently, and asap (there may be hundreds + * of simultaneous readers, even if few writers!) + */ _Atomic uint32_t spl_vm_pages_reclaimed = 0; _Atomic uint32_t spl_vm_pages_wanted = 0; _Atomic uint32_t spl_vm_pressure_level = 0; - /* * the spl_pressure_level enum only goes to four, * but we want to watch kstat for whether @@ -116,7 +137,7 @@ void read_random(void *buffer, uint_t numbytes); // the kmem module is preparing to unload. 
static int shutting_down = 0; -// Amount of RAM in machine +// Amount of RAM PAGES in machine uint64_t physmem = 0; // Size in bytes of the memory allocated in seg_kmem @@ -431,8 +452,8 @@ for (_e = &_s[(count) - 1]; _e > _s; _e--) \ struct { hrtime_t kmp_timestamp; /* timestamp of panic */ int kmp_error; /* type of kmem error */ - void *kmp_buffer; /* buffer that induced panic */ - void *kmp_realbuf; /* real start address for buffer */ + const void *kmp_buffer; /* buffer that induced panic */ + const void *kmp_realbuf; /* real start address for buffer */ kmem_cache_t *kmp_cache; /* buffer's cache according to client */ kmem_cache_t *kmp_realcache; /* actual cache containing buffer */ kmem_slab_t *kmp_slab; /* slab accoring to kmem_findslab() */ @@ -440,9 +461,15 @@ struct { } kmem_panic_info; extern uint64_t stat_osif_malloc_success; +extern uint64_t stat_osif_malloc_fail; extern uint64_t stat_osif_malloc_bytes; extern uint64_t stat_osif_free; extern uint64_t stat_osif_free_bytes; +extern uint64_t stat_osif_malloc_sub128k; +extern uint64_t stat_osif_malloc_sub64k; +extern uint64_t stat_osif_malloc_sub32k; +extern uint64_t stat_osif_malloc_page; +extern uint64_t stat_osif_malloc_subpage; extern uint64_t spl_bucket_non_pow2_allocs; @@ -462,20 +489,14 @@ extern uint64_t spl_vmem_conditional_alloc_bytes; extern uint64_t spl_vmem_conditional_alloc_deny; extern uint64_t spl_vmem_conditional_alloc_deny_bytes; -extern uint64_t spl_xat_success; -extern uint64_t spl_xat_late_success; -extern uint64_t spl_xat_late_success_nosleep; extern uint64_t spl_xat_pressured; -extern uint64_t spl_xat_bailed; -extern uint64_t spl_xat_bailed_contended; extern uint64_t spl_xat_lastalloc; extern uint64_t spl_xat_lastfree; -extern uint64_t spl_xat_forced; extern uint64_t spl_xat_sleep; -extern uint64_t spl_xat_late_deny; -extern uint64_t spl_xat_no_waiters; -extern uint64_t spl_xft_wait; +extern uint64_t spl_vba_fastpath; +extern uint64_t spl_vba_fastexit; +extern uint64_t spl_vba_slowpath; extern uint64_t spl_vba_parent_memory_appeared; extern uint64_t spl_vba_parent_memory_blocked; extern uint64_t spl_vba_hiprio_blocked; @@ -507,6 +528,7 @@ uint64_t kmem_free_to_slab_when_fragmented = 0; extern _Atomic uint64_t spl_lowest_vdev_disk_stack_remaining; extern _Atomic uint64_t spl_lowest_zvol_stack_remaining; extern _Atomic uint64_t spl_lowest_alloc_stack_remaining; +extern unsigned int spl_split_stack_below; typedef struct spl_stats { kstat_named_t spl_os_alloc; @@ -518,12 +540,27 @@ typedef struct spl_stats { kstat_named_t spl_spl_free; kstat_named_t spl_spl_free_manual_pressure; kstat_named_t spl_spl_free_fast_pressure; - kstat_named_t spl_spl_free_delta_ema; kstat_named_t spl_spl_free_negative_count; kstat_named_t spl_osif_malloc_success; + kstat_named_t spl_osif_malloc_fail; kstat_named_t spl_osif_malloc_bytes; kstat_named_t spl_osif_free; kstat_named_t spl_osif_free_bytes; + + kstat_named_t spl_enforce_memory_caps; + kstat_named_t spl_dynamic_memory_cap; + kstat_named_t spl_dynamic_memory_cap_skipped; + kstat_named_t spl_dynamic_memory_cap_reductions; + kstat_named_t spl_dynamic_memory_cap_hit_floor; + kstat_named_t spl_manual_memory_cap; + kstat_named_t spl_memory_cap_enforcements; + + kstat_named_t spl_osif_malloc_sub128k; + kstat_named_t spl_osif_malloc_sub64k; + kstat_named_t spl_osif_malloc_sub32k; + kstat_named_t spl_osif_malloc_page; + kstat_named_t spl_osif_malloc_subpage; + kstat_named_t spl_bucket_non_pow2_allocs; kstat_named_t spl_vmem_unconditional_allocs; @@ -533,20 +570,15 @@ typedef struct 
spl_stats { kstat_named_t spl_vmem_conditional_alloc_deny; kstat_named_t spl_vmem_conditional_alloc_deny_bytes; - kstat_named_t spl_xat_success; - kstat_named_t spl_xat_late_success; - kstat_named_t spl_xat_late_success_nosleep; kstat_named_t spl_xat_pressured; kstat_named_t spl_xat_bailed; - kstat_named_t spl_xat_bailed_contended; kstat_named_t spl_xat_lastalloc; kstat_named_t spl_xat_lastfree; - kstat_named_t spl_xat_forced; kstat_named_t spl_xat_sleep; - kstat_named_t spl_xat_late_deny; - kstat_named_t spl_xat_no_waiters; - kstat_named_t spl_xft_wait; + kstat_named_t spl_vba_fastpath; + kstat_named_t spl_vba_fastexit; + kstat_named_t spl_vba_slowpath; kstat_named_t spl_vba_parent_memory_appeared; kstat_named_t spl_vba_parent_memory_blocked; kstat_named_t spl_vba_hiprio_blocked; @@ -573,10 +605,10 @@ typedef struct spl_stats { kstat_named_t spl_vm_pages_reclaimed; kstat_named_t spl_vm_pages_wanted; kstat_named_t spl_vm_pressure_level; - kstat_named_t spl_lowest_alloc_stack_remaining; kstat_named_t spl_lowest_vdev_disk_stack_remaining; kstat_named_t spl_lowest_zvol_stack_remaining; + kstat_named_t spl_split_stack_below; } spl_stats_t; static spl_stats_t spl_stats = { @@ -589,12 +621,27 @@ static spl_stats_t spl_stats = { {"spl_spl_free", KSTAT_DATA_INT64}, {"spl_spl_free_manual_pressure", KSTAT_DATA_UINT64}, {"spl_spl_free_fast_pressure", KSTAT_DATA_UINT64}, - {"spl_spl_free_delta_ema", KSTAT_DATA_UINT64}, {"spl_spl_free_negative_count", KSTAT_DATA_UINT64}, {"spl_osif_malloc_success", KSTAT_DATA_UINT64}, + {"spl_osif_malloc_fail", KSTAT_DATA_UINT64}, {"spl_osif_malloc_bytes", KSTAT_DATA_UINT64}, {"spl_osif_free", KSTAT_DATA_UINT64}, {"spl_osif_free_bytes", KSTAT_DATA_UINT64}, + + {"spl_osif_enforce_memory_caps", KSTAT_DATA_UINT64}, + {"spl_osif_dynamic_memory_cap", KSTAT_DATA_UINT64}, + {"spl_osif_dynamic_memory_cap_skipped", KSTAT_DATA_UINT64}, + {"spl_osif_dynamic_memory_cap_reductions", KSTAT_DATA_UINT64}, + {"spl_osif_dynamic_memory_cap_hit_floor", KSTAT_DATA_UINT64}, + {"spl_osif_manual_memory_cap", KSTAT_DATA_UINT64}, + {"spl_osif_memory_cap_enforcements", KSTAT_DATA_UINT64}, + + {"spl_osif_malloc_sub128k", KSTAT_DATA_UINT64}, + {"spl_osif_malloc_sub64k", KSTAT_DATA_UINT64}, + {"spl_osif_malloc_sub32k", KSTAT_DATA_UINT64}, + {"spl_osif_malloc_page", KSTAT_DATA_UINT64}, + {"spl_osif_malloc_subpage", KSTAT_DATA_UINT64}, + {"spl_bucket_non_pow2_allocs", KSTAT_DATA_UINT64}, {"vmem_unconditional_allocs", KSTAT_DATA_UINT64}, @@ -604,20 +651,14 @@ static spl_stats_t spl_stats = { {"vmem_conditional_alloc_deny", KSTAT_DATA_UINT64}, {"vmem_conditional_alloc_deny_bytes", KSTAT_DATA_UINT64}, - {"spl_xat_success", KSTAT_DATA_UINT64}, - {"spl_xat_late_success", KSTAT_DATA_UINT64}, - {"spl_xat_late_success_nosleep", KSTAT_DATA_UINT64}, {"spl_xat_pressured", KSTAT_DATA_UINT64}, - {"spl_xat_bailed", KSTAT_DATA_UINT64}, - {"spl_xat_bailed_contended", KSTAT_DATA_UINT64}, {"spl_xat_lastalloc", KSTAT_DATA_UINT64}, {"spl_xat_lastfree", KSTAT_DATA_UINT64}, - {"spl_xat_forced", KSTAT_DATA_UINT64}, {"spl_xat_sleep", KSTAT_DATA_UINT64}, - {"spl_xat_late_deny", KSTAT_DATA_UINT64}, - {"spl_xat_no_waiters", KSTAT_DATA_UINT64}, - {"spl_xft_wait", KSTAT_DATA_UINT64}, + {"spl_vba_fastpath", KSTAT_DATA_UINT64}, + {"spl_vba_fastexit", KSTAT_DATA_UINT64}, + {"spl_vba_slowpath", KSTAT_DATA_UINT64}, {"spl_vba_parent_memory_appeared", KSTAT_DATA_UINT64}, {"spl_vba_parent_memory_blocked", KSTAT_DATA_UINT64}, {"spl_vba_hiprio_blocked", KSTAT_DATA_UINT64}, @@ -644,11 +685,10 @@ static spl_stats_t spl_stats = { 
{"spl_vm_pages_reclaimed", KSTAT_DATA_UINT64}, {"spl_vm_pages_wanted", KSTAT_DATA_UINT64}, {"spl_vm_pressure_level", KSTAT_DATA_UINT64}, - {"lowest_alloc_stack_remaining", KSTAT_DATA_UINT64}, {"lowest_vdev_disk_stack_remaining", KSTAT_DATA_UINT64}, {"lowest_zvol_stack_remaining", KSTAT_DATA_UINT64}, - + {"split_stack_below", KSTAT_DATA_UINT64}, }; static kstat_t *spl_ksp = 0; @@ -721,11 +761,11 @@ copy_pattern(uint64_t pattern, void *buf_arg, size_t size) *buf++ = pattern; } -static void * -verify_pattern(uint64_t pattern, void *buf_arg, size_t size) +static const void * +verify_pattern(uint64_t pattern, const void *buf_arg, size_t size) { - uint64_t *bufend = (uint64_t *)((char *)buf_arg + size); - uint64_t *buf; + const uint64_t *bufend = (const uint64_t *)((char *)buf_arg + size); + const uint64_t *buf; for (buf = buf_arg; buf < bufend; buf++) if (*buf != pattern) @@ -790,7 +830,7 @@ kmem_cache_applyall_id(void (*func)(kmem_cache_t *), taskq_t *tq, int tqflag) * Debugging support. Given a buffer address, find its slab. */ static kmem_slab_t * -kmem_findslab(kmem_cache_t *cp, void *buf) +kmem_findslab(kmem_cache_t *cp, const void *buf) { kmem_slab_t *sp; @@ -815,14 +855,14 @@ kmem_findslab(kmem_cache_t *cp, void *buf) } static void -kmem_error(int error, kmem_cache_t *cparg, void *bufarg) +kmem_error(int error, kmem_cache_t *cparg, const void *bufarg) { kmem_buftag_t *btp = NULL; kmem_bufctl_t *bcp = NULL; kmem_cache_t *cp = cparg; kmem_slab_t *sp; - uint64_t *off; - void *buf = bufarg; + const uint64_t *off; + const void *buf = bufarg; kmem_logging = 0; /* stop logging when a bad thing happens */ @@ -883,10 +923,15 @@ kmem_error(int error, kmem_cache_t *cparg, void *bufarg) case KMERR_MODIFIED: TraceEvent(TRACE_ERROR, "buffer modified after being" " freed\n"); + dprintf("buffer modified after being freed\n"); off = verify_pattern(KMEM_FREE_PATTERN, buf, cp->cache_verify); if (off == NULL) /* shouldn't happen */ off = buf; + dprintf("SPL: modification occurred at offset 0x%lx " + "(0x%llx replaced by 0x%llx)\n", + (uintptr_t)off - (uintptr_t)buf, + (longlong_t)KMEM_FREE_PATTERN, (longlong_t)*off); TraceEvent(TRACE_ERROR, "SPL: modification occurred " "at offset 0x%lx (0x%llx replaced by 0x%llx)\n", (uintptr_t)off - (uintptr_t)buf, @@ -894,21 +939,28 @@ kmem_error(int error, kmem_cache_t *cparg, void *bufarg) break; case KMERR_REDZONE: + dprintf("redzone violation: write past end of buf\n"); TraceEvent(TRACE_ERROR, "redzone violation: write past" " end of buffer\n"); break; case KMERR_BADADDR: + dprintf("invalid free: buffer not in cache\n"); TraceEvent(TRACE_ERROR, "invalid free: buffer not in" " cache\n"); break; case KMERR_DUPFREE: + dprintf("duplicate free: buffer freed twice\n"); TraceEvent(TRACE_ERROR, "duplicate free: buffer freed" " twice\n"); break; case KMERR_BADBUFTAG: + dprintf("boundary tag corrupted\n"); + dprintf("SPL: bcp ^ bxstat = %lx, should be %lx\n", + (intptr_t)btp->bt_bufctl ^ btp->bt_bxstat, + KMEM_BUFTAG_FREE); TraceEvent(TRACE_ERROR, "boundary tag corrupted\n"); TraceEvent(TRACE_ERROR, "SPL: bcp ^ bxstat = %lx, " "should be %lx\n", @@ -917,10 +969,16 @@ kmem_error(int error, kmem_cache_t *cparg, void *bufarg) break; case KMERR_BADBUFCTL: + dprintf("bufctl corrupted\n"); TraceEvent(TRACE_ERROR, "bufctl corrupted\n"); break; case KMERR_BADCACHE: + dprintf("buffer freed to wrong cache\n"); + dprintf("SPL: buffer was allocated from %s,\n", + cp->cache_name); + dprintf("SPL: caller attempting free to %s.\n", + cparg->cache_name); TraceEvent(TRACE_ERROR, "buffer freed 
to wrong " "cache\n"); TraceEvent(TRACE_ERROR, "SPL: buffer was allocated" @@ -930,6 +988,9 @@ kmem_error(int error, kmem_cache_t *cparg, const void *bufarg) break; case KMERR_BADSIZE: + dprintf("bad free: free size (%u) != alloc size (%u)\n", + KMEM_SIZE_DECODE(((uint32_t *)btp)[0]), + KMEM_SIZE_DECODE(((uint32_t *)btp)[1])); TraceEvent(TRACE_ERROR, "bad free: free size (%u) !=" " alloc size (%u)\n", KMEM_SIZE_DECODE(((uint32_t *)btp)[0]), @@ -937,6 +998,8 @@ kmem_error(int error, kmem_cache_t *cparg, const void *bufarg) break; case KMERR_BADBASE: + dprintf("bad free: free address (%p) != alloc address" + " (%p)\n", bufarg, buf); TraceEvent(TRACE_ERROR, "bad free: free address" " (%p) != alloc address (%p)\n", bufarg, buf); break; @@ -1376,7 +1439,7 @@ static void kmem_slab_move_yes(kmem_cache_t *, kmem_slab_t *, void *); static void kmem_slab_free(kmem_cache_t *cp, void *buf) { - kmem_slab_t *sp; + kmem_slab_t *sp = NULL; kmem_bufctl_t *bcp, **prev_bcpp; ASSERT(buf != NULL); @@ -1754,12 +1817,24 @@ kmem_depot_ws_zero(kmem_cache_t *cp) } /* - * The number of bytes to reap before we call kpreempt(). The default (1MB) - * causes us to preempt reaping up to hundres of times per second. Using a - * larger value (1GB) causes this to have virtually no effect. + * The number of bytes to reap before we call kpreempt(). + * + * There is a tradeoff between potentially very many preempts when + * freeing a large amount of ARC scatter ABDs (the preempts slightly slow down + * the return of memory to parent arenas during a larger reap, which in turn + * slightly delays the return of memory to the operating system) versus + * letting other threads on low-core-count machines make forward progress + * (which was upstream's goal when reap preemption was first introduced) or + * (in more modern times) gaining efficiencies in busy high-core-count + * machines that can have many threads allocating while an inevitably + * long-lived reap is in progress, narrowing the possibility of destroying + * kmem structures that might have to be rebuilt during the next preemption. + * + * Historically 1M was the value from upstream, which was increased for o3x + * for performance reasons. The reap mechanisms have evolved such that 1M + * is once again the better default. */ -size_t kmem_reap_preempt_bytes = 64 * 1024 * 1024; - +size_t kmem_reap_preempt_bytes = 1024 * 1024; /* * Reap all magazines that have fallen out of the depot's working set. 
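For reference, the behaviour the new 1 MiB default tunes is the byte-bounded yield inside the depot reap loop: the reaper keeps a running count of bytes returned to the slab layer and, whenever that count reaches kmem_reap_preempt_bytes, calls kpreempt() and starts counting again. A minimal sketch of that pattern follows (not part of the patch); reap_one_magazine() is a hypothetical stand-in for the magazine draining actually performed by kmem_depot_ws_reap().

/* hypothetical helper: drain one full magazine, return bytes freed */
static size_t reap_one_magazine(kmem_cache_t *cp);

static void
depot_reap_sketch(kmem_cache_t *cp)
{
	size_t bytes = 0;
	size_t freed;

	while ((freed = reap_one_magazine(cp)) != 0) {
		bytes += freed;
		if (bytes >= kmem_reap_preempt_bytes) {
			/* yield so other runnable threads make progress */
			kpreempt(KPREEMPT_SYNC);
			bytes = 0;
		}
	}
}

With the 1 MiB default this yields roughly once per megabyte reaped, which is the balance the comment above argues for.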
@@ -1774,6 +1849,22 @@ kmem_depot_ws_reap(kmem_cache_t *cp) ASSERT(!list_link_active(&cp->cache_link) || taskq_member(kmem_taskq, curthread)); + bool mtx_contended = false; + + if (!mutex_tryenter(&cp->cache_reap_lock)) { + mtx_contended = true; + dprintf("ZFS: SPL: %s:%s:%d: could not get lock\n", + __FILE__, __func__, __LINE__); + IOSleep(1); + mutex_enter(&cp->cache_reap_lock); + } + + if (mtx_contended) + dprintf("ZFS: SPL: %s:%s:%d: reap mutex for %s " + "was contended\n", + __FILE__, __func__, __LINE__, + cp->cache_name); + reap = MIN(cp->cache_full.ml_reaplimit, cp->cache_full.ml_min); while (reap-- && (mp = kmem_depot_alloc(cp, &cp->cache_full)) != NULL) { @@ -1795,6 +1886,8 @@ kmem_depot_ws_reap(kmem_cache_t *cp) bytes = 0; } } + + mutex_exit(&cp->cache_reap_lock); } static void @@ -1807,7 +1900,7 @@ kmem_cpu_reload(kmem_cpu_cache_t *ccp, kmem_magazine_t *mp, int rounds) ccp->cc_ploaded = ccp->cc_loaded; ccp->cc_prounds = ccp->cc_rounds; ccp->cc_loaded = mp; - ccp->cc_rounds = rounds; + ccp->cc_rounds = (short)rounds; } /* @@ -1951,7 +2044,6 @@ kmem_dump_finish(char *buf, size_t size) int kdi_end = kmem_dump_log_idx; int percent = 0; int header = 0; - int warn = 0; size_t used; kmem_cache_t *cp; kmem_dump_log_t *kdl; @@ -1969,7 +2061,7 @@ kmem_dump_finish(char *buf, size_t size) kmem_dumppr(&p, e, "heap size,%ld\n", kmem_dump_size); kmem_dumppr(&p, e, "Oversize allocs,%d\n", kmem_dump_oversize_allocs); - kmem_dumppr(&p, e, "Oversize max size,%ld\n", + kmem_dumppr(&p, e, "Oversize max size,%u\n", kmem_dump_oversize_max); for (kdi_idx = 0; kdi_idx < kdi_end; kdi_idx++) { @@ -1977,8 +2069,6 @@ kmem_dump_finish(char *buf, size_t size) cp = kdl->kdl_cache; if (cp == NULL) break; - if (kdl->kdl_alloc_fails) - ++warn; if (header == 0) { kmem_dumppr(&p, e, "Cache Name,Allocs,Frees,Alloc Fails," @@ -2354,7 +2444,7 @@ kmem_cache_parent_arena_fragmented(kmem_cache_t *cp) * Free a constructed object to cache cp. */ void -kmem_cache_free(kmem_cache_t *cp, void *buf) +kmem_cache_free(kmem_cache_t *cp, const void *buf) { kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp); @@ -2372,7 +2462,8 @@ kmem_cache_free(kmem_cache_t *cp, void *buf) ASSERT(!(ccp->cc_flags & KMF_DUMPDIVERT)); /* log it so that we can warn about it */ KDI_LOG(cp, kdl_unsafe); - } else if (KMEM_DUMPCC(ccp) && !kmem_cache_free_dump(cp, buf)) { + } else if (KMEM_DUMPCC(ccp) && !kmem_cache_free_dump(cp, + __DECONST(void *, buf))) { return; } if (ccp->cc_flags & KMF_BUFTAG) { @@ -2391,7 +2482,8 @@ kmem_cache_free(kmem_cache_t *cp, void *buf) * loaded magazine, just put the object there and return. 
*/ if ((uint_t)ccp->cc_rounds < ccp->cc_magsize) { - ccp->cc_loaded->mag_round[ccp->cc_rounds++] = buf; + ccp->cc_loaded->mag_round[ccp->cc_rounds++] = + __DECONST(void *, buf); ccp->cc_free++; mutex_exit(&ccp->cc_lock); return; @@ -2439,7 +2531,7 @@ kmem_cache_free(kmem_cache_t *cp, void *buf) } mutex_exit(&ccp->cc_lock); kpreempt(KPREEMPT_SYNC); - kmem_slab_free_constructed(cp, buf, B_TRUE); + kmem_slab_free_constructed(cp, __DECONST(void *, buf), B_TRUE); } /* @@ -2652,7 +2744,7 @@ zfs_kmem_alloc(size_t size, int kmflag) } void -zfs_kmem_free(void *buf, size_t size) +zfs_kmem_free(const void *buf, size_t size) { size_t index; kmem_cache_t *cp; @@ -2774,7 +2866,8 @@ kmem_reap_timeout(void *flag_arg) ASSERT(flag == (uint32_t *)&kmem_reaping || flag == (uint32_t *)&kmem_reaping_idspace); - *flag = 0; + __atomic_store_n(flag, 0, __ATOMIC_RELEASE); + ASSERT3U(*flag, ==, 0); } static void @@ -2813,19 +2906,27 @@ kmem_reap_common(void *flag_arg) { uint32_t *flag = (uint32_t *)flag_arg; + ASSERT(flag == &kmem_reaping || flag == &kmem_reaping_idspace); + /* If conditions are met, try to set flag to 1 */ if (MUTEX_HELD(&kmem_cache_lock) || kmem_taskq == NULL || atomic_cas_32(flag, 0, 1) != 0) return; + /* + * If we are here, the appropriate flag is 1. It will be atomically + * zeroed after the reaping has finished and the timeout has expired. + */ /* - * It may not be kosher to do memory allocation when a reap is called + * It may not be safe to do memory allocation when a reap is called * is called (for example, if vmem_populate() is in the call chain). * So we start the reap going with a TQ_NOALLOC dispatch. If the * dispatch fails, we reset the flag, and the next reap will try again. */ - if (!taskq_dispatch(kmem_taskq, kmem_reap_start, flag, TQ_NOALLOC)) - *flag = 0; + if (!taskq_dispatch(kmem_taskq, kmem_reap_start, flag, TQ_NOALLOC)) { + __atomic_store_n(flag, 0, __ATOMIC_RELEASE); + ASSERT3U(*flag, ==, 0); + } } /* @@ -2939,21 +3040,36 @@ kmem_cache_magazine_disable(kmem_cache_t *cp) boolean_t kmem_cache_reap_active(void) { - return (B_FALSE); + return (kmem_reaping.flag); } /* - * Reap (almost) everything right now. + * Fire off a kmem_reap(); that will put a kmem_reap_start() into the taskq if + * conditions are favourable. + * + * This function can be frequently called by common code. Arguably it is + * over-called. + * + * Previously, a kmem_depot_ws_zero(cp) would erase the working set + * information of the kmem cache; it is probably better to let other events + * evolve the magazine working set. + * + * Also previously, a kmem_depot_ws_reap(cp) was dispatched on the kmem taskq. + * This appears to have some unsafeness with respect to concurrency, and this + * unconditional start-a-reap-right-now approach was abandoned by the other + * openzfs ports. On macOS there does not seem to be an advantage in stepping + * around the kmem_reap{,common,start,timeout}() concurrency-controlling + * mechanism (atomic compare-and-swap on kmem_reaping, with an atomic set to + * zero after a delay once the reaping task is done). Moreover, skipping the + * kmem_reaping flag check may have led to double-frees of destroyed depots to + * qcache-equipped vmem arenas. 
*/ void -kmem_cache_reap_now(kmem_cache_t *cp) +kmem_cache_reap_now(kmem_cache_t *cp __maybe_unused) { ASSERT(list_link_active(&cp->cache_link)); - kmem_depot_ws_zero(cp); - - (void) taskq_dispatch(kmem_taskq, - (task_func_t *)kmem_depot_ws_reap, cp, TQ_SLEEP); + kmem_reap(); } /* @@ -3285,7 +3401,7 @@ kmem_cache_stat(kmem_cache_t *cp, char *name) // TRUE if we have more than a critical minimum of memory // used in arc_memory_throttle; if FALSE, we throttle -bool +static bool spl_minimal_physmem_p_logic() { // do we have enough memory to avoid throttling? @@ -3318,13 +3434,7 @@ spl_minimal_physmem_p(void) size_t kmem_maxavail(void) { -#ifndef APPLE - // spgcnt_t pmem = availrmem - tune.t_minarmem; - // spgcnt_t vmem = btop(vmem_size(heap_arena, VMEM_FREE)); - // - // return ((size_t)ptob(MAX(MIN(pmem, vmem), 0))); -#endif - return (physmem * PAGE_SIZE); + return (total_memory); } /* @@ -3624,7 +3734,7 @@ kmem_cache_create( ASSERT(chunksize + sizeof (kmem_slab_t) <= cp->cache_slabsize); ASSERT(!(cp->cache_flags & KMF_AUDIT)); } else { - size_t chunks, bestfit, waste, slabsize; + size_t chunks, bestfit = 0, waste, slabsize; size_t minwaste = LONG_MAX; for (chunks = 1; chunks <= KMEM_VOID_FRACTION; chunks++) { @@ -3670,6 +3780,8 @@ kmem_cache_create( cp->cache_color = cp->cache_mincolor; + mutex_init(&cp->cache_reap_lock, NULL, MUTEX_DEFAULT, NULL); + /* * Initialize the rest of the slab layer. */ @@ -3892,6 +4004,13 @@ kmem_cache_destroy(kmem_cache_t *cp) kmem_cache_magazine_purge(cp); + /* + * make sure there isn't a reaper + * since it would dereference cp + */ + mutex_enter(&cp->cache_reap_lock); + mutex_exit(&cp->cache_reap_lock); + mutex_enter(&cp->cache_lock); if (cp->cache_buftotal != 0) @@ -3934,6 +4053,7 @@ kmem_cache_destroy(kmem_cache_t *cp) mutex_destroy(&cp->cache_depot_lock); mutex_destroy(&cp->cache_lock); + mutex_destroy(&cp->cache_reap_lock); vmem_free_impl(kmem_cache_arena, cp, KMEM_CACHE_SIZE(max_ncpus)); } @@ -4139,7 +4259,15 @@ kmem_cache_init(int pass, int use_large_pages) kmem_big_alloc_table_max = maxbuf >> KMEM_BIG_SHIFT; } +/* + * At kext unload, kmem_cache_build_slablist() builds a list of free slabs + * from all kmem caches, so kmem_cache_fini() can report the leaks and the + * total number of leaks. 
+ */ + struct free_slab { + char vm_name[VMEM_NAMELEN]; + char cache_name[KMEM_CACHE_NAMELEN + 1]; vmem_t *vmp; size_t slabsize; void *slab; @@ -4148,7 +4276,6 @@ struct free_slab { static list_t freelist; - void kmem_cache_build_slablist(kmem_cache_t *cp) { @@ -4163,6 +4290,9 @@ kmem_cache_build_slablist(kmem_cache_t *cp) MALLOC(fs, struct free_slab *, sizeof (struct free_slab), M_TEMP, M_WAITOK); + strlcpy(fs->vm_name, vmp->vm_name, VMEM_NAMELEN); + strlcpy(fs->cache_name, cp->cache_name, + KMEM_CACHE_NAMELEN); fs->vmp = vmp; fs->slabsize = cp->cache_slabsize; fs->slab = (void *)P2ALIGN((uintptr_t)sp->slab_base, @@ -4176,6 +4306,9 @@ kmem_cache_build_slablist(kmem_cache_t *cp) MALLOC(fs, struct free_slab *, sizeof (struct free_slab), M_TEMP, M_WAITOK); + strlcpy(fs->vm_name, vmp->vm_name, VMEM_NAMELEN); + strlcpy(fs->cache_name, cp->cache_name, + KMEM_CACHE_NAMELEN); fs->vmp = vmp; fs->slabsize = cp->cache_slabsize; fs->slab = (void *)P2ALIGN((uintptr_t)sp->slab_base, @@ -4225,19 +4358,132 @@ kmem_cache_fini() i = 0; while ((fs = list_head(&freelist))) { i++; + dprintf("SPL: %s:%d: released %lu from '%s' to '%s'\n", + __func__, __LINE__, + fs->slabsize, + fs->cache_name, + fs->vm_name); list_remove(&freelist, fs); vmem_free_impl(fs->vmp, fs->slab, fs->slabsize); FREE(fs, M_TEMP); } - xprintf("SPL: Released %u slabs\n", i); + dprintf("SPL: %s:%d: Released %u slabs TOTAL\n", + __func__, __LINE__, i); + list_destroy(&freelist); } -// this is intended to substitute for kmem_avail() in arc.c +/* + * Reduce dynamic memory cap by a set amount ("reduction"), unless the cap is + * already 1/8 of total_memory or lower. unlike the logic in + * spl-vmem.c:xnu_alloc_throttled(), we likely have not observed xnu being + * ready to deny us memory, so we drop half the cap half as much. + * + * Inter-thread synchronization of spl_dynamic_memory_cap and spl_free here in + * the next two functions is important as there _will_ be multi-core bursts + * of spl_free_wrapper() calls. + */ +int64_t +spl_reduce_dynamic_cap(void) +{ + /* + * take a snapshot of spl_dynamic_memory_cap, which + * may drop while we are in this function + */ + const uint64_t cap_in = spl_dynamic_memory_cap; + + const uint64_t reduce_amount = total_memory >> 8; + + const int64_t thresh = total_memory >> 3; + + const int64_t reduction = (int64_t)(cap_in - reduce_amount); + + const int64_t reduced = MAX(reduction, thresh); + + /* + * Adjust cap downwards if enough time has elapsed + * for previous adjustments to shrink memory use. + * + * We will still tell ARC to shrink by thresh. 
+ */ + mutex_enter(&spl_dynamic_memory_cap_lock); + + const hrtime_t now = gethrtime(); + if (now > spl_dynamic_memory_cap_last_downward_adjust + + SEC2NSEC(60)) { + + if (spl_dynamic_memory_cap == 0 || + spl_dynamic_memory_cap > total_memory) { + spl_dynamic_memory_cap_last_downward_adjust = now; + spl_dynamic_memory_cap = total_memory - reduce_amount; + atomic_inc_64(&spl_dynamic_memory_cap_reductions); + } else if (spl_dynamic_memory_cap > reduced) { + spl_dynamic_memory_cap_last_downward_adjust = now; + spl_dynamic_memory_cap = reduced; + atomic_inc_64(&spl_dynamic_memory_cap_reductions); + } else if (spl_dynamic_memory_cap <= thresh) { + spl_dynamic_memory_cap_last_downward_adjust = now; + spl_dynamic_memory_cap = thresh; + atomic_inc_64(&spl_dynamic_memory_cap_hit_floor); + } else { + atomic_inc_64(&spl_dynamic_memory_cap_skipped); + } + } else { + atomic_inc_64(&spl_dynamic_memory_cap_skipped); + } + + mutex_exit(&spl_dynamic_memory_cap_lock); + + const uint64_t cap_out = spl_dynamic_memory_cap; + const int64_t cap_diff = cap_out - cap_in; + const int64_t minusthresh = -(int64_t)thresh; + + if (cap_diff > minusthresh) { + spl_free = minusthresh; + return (minusthresh); + } else { + spl_free = cap_diff; + return (cap_diff); + } +} + +/* + * This substitutes for kmem_avail() in arc_os.c + * + * If we believe there is free memory but memory caps are active, enforce on + * them, decrementing the dynamic cap if necessary, returning a non-positive + * free memory to ARC if we have reached either enforced cap. + */ int64_t spl_free_wrapper(void) { +// MEMORYSTATUSEX memInfo; +// memInfo.dwLength = sizeof(MEMORYSTATUSEX); + + if (spl_enforce_memory_caps != 0 && spl_free > 0) { + if (segkmem_total_mem_allocated >= + spl_dynamic_memory_cap) { + atomic_inc_64(&spl_memory_cap_enforcements); + spl_set_arc_no_grow(B_TRUE); + return (spl_reduce_dynamic_cap()); + } else if (spl_manual_memory_cap > 0 && + segkmem_total_mem_allocated >= spl_manual_memory_cap) { + spl_set_arc_no_grow(B_TRUE); + atomic_inc_64(&spl_memory_cap_enforcements); + const int64_t dec = spl_manual_memory_cap - + segkmem_total_mem_allocated; + const int64_t giveback = -(total_memory >> 10); + if (dec > giveback) { + spl_free = giveback; + return (giveback); + } else { + spl_free = dec; + return (dec); + } + } + } + return (spl_free); } @@ -4296,7 +4542,7 @@ spl_free_set_and_wait_pressure(int64_t new_p, boolean_t fast, TraceEvent(TRACE_ERROR, "%s: ERROR: timed out " "after one minute!\n", __func__); break; - } else if (now > double_again_at && !doubled_again) { + } else if (doubled && now > double_again_at && !doubled_again) { doubled_again = true; new_p *= 2; } else if (now > double_at) { @@ -4317,7 +4563,9 @@ spl_free_set_pressure(int64_t new_p) spl_free_fast_pressure = FALSE; // wake up both spl_free_thread() to recalculate spl_free // and any spl_free_set_and_wait_pressure() threads - cv_broadcast(&spl_free_thread_cv); + mutex_enter(&spl_free_thread_lock); + cv_signal(&spl_free_thread_cv); + mutex_exit(&spl_free_thread_lock); } spl_free_last_pressure = zfs_lbolt(); } @@ -4430,11 +4678,6 @@ static void spl_free_thread() { callb_cpr_t cpr; - uint64_t last_update = zfs_lbolt(); - int64_t last_spl_free; - double ema_new = 0; - double ema_old = 0; - double alpha; CALLB_CPR_INIT(&cpr, &spl_free_thread_lock, callb_generic_cpr, FTAG); @@ -4442,6 +4685,9 @@ spl_free_thread() spl_free = MAX(4*1024*1024*1024, total_memory * 75ULL / 100ULL); + if (spl_dynamic_memory_cap == 0) + spl_dynamic_memory_cap = total_memory; + 
mutex_enter(&spl_free_thread_lock); dprintf("SPL: beginning spl_free_thread() loop, spl_free == %lld\n", @@ -4454,7 +4700,6 @@ spl_free_thread() mutex_exit(&spl_free_thread_lock); boolean_t lowmem = false; boolean_t emergency_lowmem = false; - int64_t base; int64_t new_spl_free = 0LL; spl_stats.spl_free_wake_count.value.ui64++; @@ -4469,8 +4714,6 @@ spl_free_thread() if (time_now > hz) time_now_seconds = time_now / hz; - last_spl_free = spl_free; - new_spl_free = total_memory - segkmem_total_mem_allocated; @@ -4489,7 +4732,11 @@ spl_free_thread() // uint32_t pages_reclaimed = 0; // uint32_t pages_wanted = 0; -/* get pressure here */ + // XNU calls mach_vm_pressure_monitor() which + // fills in pages_reclaimed and pages_wanted. + // then assign them to spl_vm_pages_reclaimed and + // spl_vm_pages_wanted + // Windows event thread will set them for us. if (spl_vm_pressure_level > 0 && spl_vm_pressure_level != MAGIC_PRESSURE_UNAVAILABLE) { @@ -4555,6 +4802,34 @@ spl_free_thread() } } + /* + * Pressure and declare zero free memory if we are above + * memory caps. This is not the hardest enforcement + * mechanism, so see also enforcement in spl_free_wrapper() + */ + if (spl_enforce_memory_caps) { + if (segkmem_total_mem_allocated >= + spl_dynamic_memory_cap) { + lowmem = true; + emergency_lowmem = true; + if (new_spl_free >= 0) + new_spl_free = + spl_dynamic_memory_cap - + segkmem_total_mem_allocated; + atomic_inc_64(&spl_memory_cap_enforcements); + } else if (spl_manual_memory_cap > 0 && + segkmem_total_mem_allocated >= + spl_manual_memory_cap) { + lowmem = true; + emergency_lowmem = true; + if (new_spl_free >= 0) + new_spl_free = + spl_manual_memory_cap - + segkmem_total_mem_allocated; + atomic_inc_64(&spl_memory_cap_enforcements); + } + } + /* * can we allocate at least a 64 MiB segment * from spl_heap_arena? 
this probes the reserve @@ -4742,8 +5017,6 @@ spl_free_thread() recent_lowmem = 0; } - base = new_spl_free; - // adjust for available memory in spl_heap_arena // cf arc_available_memory() if (!emergency_lowmem) { @@ -4797,8 +5070,6 @@ spl_free_thread() new_spl_free = -1024LL; } - double delta = (double)new_spl_free - (double)last_spl_free; - boolean_t spl_free_is_negative = false; if (new_spl_free < 0LL) { @@ -4817,6 +5088,20 @@ spl_free_thread() new_spl_free = 2LL * spamaxblksz; } + if (spl_enforce_memory_caps != 0) { + if (spl_dynamic_memory_cap != 0) { + const int64_t m = spl_dynamic_memory_cap - + segkmem_total_mem_allocated; + if (new_spl_free > m) + new_spl_free = m; + } else if (spl_manual_memory_cap != 0) { + const int64_t m = spl_manual_memory_cap - + segkmem_total_mem_allocated; + if (new_spl_free > m) + new_spl_free = m; + } + } + // NOW set spl_free from calculated new_spl_free spl_free = new_spl_free; // the direct equivalent of : @@ -4854,18 +5139,6 @@ spl_free_thread() if (lowmem) recent_lowmem = time_now; - // maintain an exponential moving average for the ema kstat - if (last_update > hz) - alpha = 1.0; - else { - double td_tick = (double)(time_now - last_update); - alpha = td_tick / (double)(hz*50.0); // roughly 0.02 - } - - ema_new = (alpha * delta) + (1.0 - alpha)*ema_old; - spl_free_delta_ema = ema_new; - ema_old = ema_new; - justwait: mutex_enter(&spl_free_thread_lock); CALLB_CPR_SAFE_BEGIN(&cpr); @@ -4883,12 +5156,20 @@ spl_free_thread() thread_exit(); } +/* + * Windows specific pressure monitor + * We expect this function to set + * spl_vm_pages_reclaimed + * spl_vm_pages_wanted + * spl_vm_pressure_level + * (kVMPressureNormal=0, Warning=1, Urgent=2, Critical=3) + */ static void spl_event_thread(void *notused) { // callb_cpr_t cpr; NTSTATUS Status; - + LARGE_INTEGER timeout; DECLARE_CONST_UNICODE_STRING(low_mem_name, L"\\KernelObjects\\LowMemoryCondition"); HANDLE low_mem_handle; @@ -4905,24 +5186,44 @@ spl_event_thread(void *notused) dprintf("SPL: beginning spl_event_thread() loop\n"); + timeout.QuadPart = -SEC2NSEC100(30); // 30 seconds. + while (!spl_event_thread_exit) { /* Don't busy loop */ delay(hz); - /* Sleep forever waiting for event */ + /* + * Sleep up to 30s waiting for event, if timeout + * we assume the system is not "low memory". 
+ */ Status = KeWaitForSingleObject(low_mem_event, Executive, - KernelMode, FALSE, NULL); + KernelMode, FALSE, &timeout); KeClearEvent(low_mem_event); - dprintf("%s: LOWMEMORY EVENT *** 0x%x (memusage: %llu)\n", - __func__, Status, segkmem_total_mem_allocated); - /* We were signalled */ - // vm_page_free_wanted = vm_page_free_min; - spl_free_set_pressure(spl_vm_page_free_min); - cv_broadcast(&spl_free_thread_cv); - } + if (Status == STATUS_TIMEOUT) { + + spl_vm_pages_reclaimed = 0; + if (spl_vm_pressure_level > 0) + spl_vm_pressure_level--; + else + spl_vm_pages_wanted = 0; + + } else { + dprintf( + "%s: LOWMEMORY EVENT *** 0x%x (memusage: %llu)\n", + __func__, Status, segkmem_total_mem_allocated); + /* We were signalled */ + // vm_page_free_wanted = vm_page_free_min; + // spl_free_set_pressure(spl_vm_page_free_min); + spl_vm_pages_reclaimed = 0; + spl_vm_pages_wanted += spl_vm_page_free_min; + if (spl_vm_pressure_level < 3) + spl_vm_pressure_level++; + cv_broadcast(&spl_free_thread_cv); + } + } ZwClose(low_mem_handle); spl_event_thread_exit = FALSE; @@ -4977,6 +5278,43 @@ spl_kstat_update(kstat_t *ksp, int rw) ks->kmem_free_to_slab_when_fragmented.value.ui64; } + if ((unsigned int) ks->spl_split_stack_below.value.ui64 != + spl_split_stack_below) { + spl_split_stack_below = + (unsigned int) + ks->spl_split_stack_below.value.ui64; + } + + if (ks->spl_enforce_memory_caps.value.ui64 != + spl_enforce_memory_caps) { + spl_enforce_memory_caps = + ks->spl_enforce_memory_caps.value.ui64; + } + + if (ks->spl_manual_memory_cap.value.ui64 != + spl_manual_memory_cap) { + uint64_t v = + ks->spl_manual_memory_cap.value.ui64; + if (v < total_memory >> 3) + v = total_memory >> 3; + else if (v > total_memory) + v = 0; + spl_manual_memory_cap = v; + } + + if (ks->spl_dynamic_memory_cap.value.ui64 != + spl_dynamic_memory_cap) { + uint64_t v = + ks->spl_dynamic_memory_cap.value.ui64; + if (v == 0) + v = total_memory; + else if (v < total_memory >> 3) + v = total_memory >> 3; + else if (v > total_memory) + v = total_memory; + spl_dynamic_memory_cap = v; + } + } else { ks->spl_os_alloc.value.ui64 = segkmem_total_mem_allocated; ks->spl_active_threads.value.ui64 = zfs_threads; @@ -4988,12 +5326,40 @@ spl_kstat_update(kstat_t *ksp, int rw) spl_free_manual_pressure; ks->spl_spl_free_fast_pressure.value.i64 = spl_free_fast_pressure; - ks->spl_spl_free_delta_ema.value.i64 = spl_free_delta_ema; ks->spl_osif_malloc_success.value.ui64 = stat_osif_malloc_success; + ks->spl_osif_malloc_fail.value.ui64 = + stat_osif_malloc_fail; ks->spl_osif_malloc_bytes.value.ui64 = stat_osif_malloc_bytes; ks->spl_osif_free.value.ui64 = stat_osif_free; ks->spl_osif_free_bytes.value.ui64 = stat_osif_free_bytes; + + ks->spl_enforce_memory_caps.value.ui64 = + spl_enforce_memory_caps; + ks->spl_dynamic_memory_cap.value.ui64 = + spl_dynamic_memory_cap; + ks->spl_dynamic_memory_cap_skipped.value.ui64 = + spl_dynamic_memory_cap_skipped; + ks->spl_dynamic_memory_cap_reductions.value.ui64 = + spl_dynamic_memory_cap_reductions; + ks->spl_dynamic_memory_cap_hit_floor.value.ui64 = + spl_dynamic_memory_cap_hit_floor; + ks->spl_manual_memory_cap.value.ui64 = + spl_manual_memory_cap; + ks->spl_memory_cap_enforcements.value.ui64 = + spl_memory_cap_enforcements; + + ks->spl_osif_malloc_sub128k.value.ui64 = + stat_osif_malloc_sub128k; + ks->spl_osif_malloc_sub64k.value.ui64 = + stat_osif_malloc_sub64k; + ks->spl_osif_malloc_sub32k.value.ui64 = + stat_osif_malloc_sub32k; + ks->spl_osif_malloc_page.value.ui64 = + stat_osif_malloc_page; + 
ks->spl_osif_malloc_subpage.value.ui64 = + stat_osif_malloc_subpage; + ks->spl_bucket_non_pow2_allocs.value.ui64 = spl_bucket_non_pow2_allocs; @@ -5010,22 +5376,17 @@ spl_kstat_update(kstat_t *ksp, int rw) ks->spl_vmem_conditional_alloc_deny_bytes.value.ui64 = spl_vmem_conditional_alloc_deny_bytes; - ks->spl_xat_success.value.ui64 = spl_xat_success; - ks->spl_xat_late_success.value.ui64 = spl_xat_late_success; - ks->spl_xat_late_success_nosleep.value.ui64 = - spl_xat_late_success_nosleep; ks->spl_xat_pressured.value.ui64 = spl_xat_pressured; - ks->spl_xat_bailed.value.ui64 = spl_xat_bailed; - ks->spl_xat_bailed_contended.value.ui64 = - spl_xat_bailed_contended; ks->spl_xat_lastalloc.value.ui64 = spl_xat_lastalloc; ks->spl_xat_lastfree.value.ui64 = spl_xat_lastfree; - ks->spl_xat_forced.value.ui64 = spl_xat_forced; ks->spl_xat_sleep.value.ui64 = spl_xat_sleep; - ks->spl_xat_late_deny.value.ui64 = spl_xat_late_deny; - ks->spl_xat_no_waiters.value.ui64 = spl_xat_no_waiters; - ks->spl_xft_wait.value.ui64 = spl_xft_wait; + ks->spl_vba_fastpath.value.ui64 = + spl_vba_fastpath; + ks->spl_vba_fastexit.value.ui64 = + spl_vba_fastexit; + ks->spl_vba_slowpath.value.ui64 = + spl_vba_slowpath; ks->spl_vba_parent_memory_appeared.value.ui64 = spl_vba_parent_memory_appeared; ks->spl_vba_parent_memory_blocked.value.ui64 = @@ -5072,6 +5433,8 @@ spl_kstat_update(kstat_t *ksp, int rw) spl_lowest_vdev_disk_stack_remaining; ks->spl_lowest_zvol_stack_remaining.value.ui64 = spl_lowest_zvol_stack_remaining; + ks->spl_split_stack_below.value.ui64 = + spl_split_stack_below; } return (0); @@ -5355,6 +5718,8 @@ spl_kmem_thread_init(void) // Initialize the spl_free locks mutex_init(&spl_free_thread_lock, "spl_free_thead_lock", MUTEX_DEFAULT, NULL); + mutex_init(&spl_dynamic_memory_cap_lock, "spl_dynamic_memory_cap_lock", + MUTEX_DEFAULT, NULL); kmem_taskq = taskq_create("kmem_taskq", 1, minclsyspri, 600, INT_MAX, TASKQ_PREPOPULATE); @@ -5392,6 +5757,8 @@ spl_kmem_thread_fini(void) cv_destroy(&spl_free_thread_cv); mutex_destroy(&spl_free_thread_lock); + mutex_destroy(&spl_dynamic_memory_cap_lock); + bsd_untimeout(kmem_update, &kmem_update_timer); bsd_untimeout(kmem_reap_timeout, &kmem_reaping); bsd_untimeout(kmem_reap_timeout, &kmem_reaping_idspace); @@ -6606,18 +6973,25 @@ kmem_cache_buf_in_cache(kmem_cache_t *cparg, void *bufarg) } if (sp == NULL) { + dprintf("SPL: %s: KMERR_BADADDR orig cache = %s\n", + __func__, cparg->cache_name); TraceEvent(TRACE_ERROR, "SPL: %s: KMERR_BADADDR orig cache =" " %s\n", __func__, cparg->cache_name); return (NULL); } if (cp == NULL) { + dprintf("SPL: %s: ERROR cp == NULL; cparg == %s", + __func__, cparg->cache_name); TraceEvent(TRACE_ERROR, "SPL: %s: ERROR cp == NULL; cparg ==" " %s", __func__, cparg->cache_name); return (NULL); } if (cp != cparg) { + dprintf("SPL: %s: KMERR_BADCACHE arg cache = %s but found " + "in %s instead\n", + __func__, cparg->cache_name, cp->cache_name); TraceEvent(TRACE_ERROR, "SPL: %s: KMERR_BADCACHE arg cache =" " %s but found in %s instead\n", __func__, cparg->cache_name, cp->cache_name); diff --git a/module/os/windows/spl/spl-kstat.c b/module/os/windows/spl/spl-kstat.c index d19115bf91aa..6970cf4a219e 100644 --- a/module/os/windows/spl/spl-kstat.c +++ b/module/os/windows/spl/spl-kstat.c @@ -700,7 +700,7 @@ kstat_free(ekstat_t *e) extern vmem_t *heap_arena; void *segkmem_alloc(vmem_t *vmp, size_t size, int vmflag); -void segkmem_free(vmem_t *vmp, void *inaddr, size_t size); +void segkmem_free(vmem_t *vmp, const void *inaddr, size_t size); /* * Create various 
system kstats. diff --git a/module/os/windows/spl/spl-seg_kmem.c b/module/os/windows/spl/spl-seg_kmem.c index a992610785db..44ad83133eb7 100644 --- a/module/os/windows/spl/spl-seg_kmem.c +++ b/module/os/windows/spl/spl-seg_kmem.c @@ -88,16 +88,13 @@ #ifdef _KERNEL -#define XNU_KERNEL_PRIVATE - #include - #endif /* _KERNEL */ typedef int page_t; void *segkmem_alloc(vmem_t *vmp, size_t size, int vmflag); -void segkmem_free(vmem_t *vmp, void *inaddr, size_t size); +void segkmem_free(vmem_t *vmp, const void *inaddr, size_t size); /* Total memory held allocated */ uint64_t segkmem_total_mem_allocated = 0; @@ -107,13 +104,21 @@ vmem_t *heap_arena; /* qcaches abd */ vmem_t *abd_arena; +vmem_t *abd_subpage_arena; #ifdef _KERNEL extern uint64_t total_memory; uint64_t stat_osif_malloc_success = 0; +uint64_t stat_osif_malloc_fail = 0; uint64_t stat_osif_free = 0; uint64_t stat_osif_malloc_bytes = 0; uint64_t stat_osif_free_bytes = 0; +uint64_t stat_osif_malloc_sub128k = 0; +uint64_t stat_osif_malloc_sub64k = 0; +uint64_t stat_osif_malloc_sub32k = 0; +uint64_t stat_osif_malloc_page = 0; +uint64_t stat_osif_malloc_subpage = 0; +void spl_free_set_emergency_pressure(int64_t new_p); #endif void * @@ -122,6 +127,17 @@ osif_malloc(uint64_t size) #ifdef _KERNEL void *tr = NULL; + if (size < PAGESIZE) + atomic_inc_64(&stat_osif_malloc_subpage); + else if (size == PAGESIZE) + atomic_inc_64(&stat_osif_malloc_page); + else if (size < 32768) + atomic_inc_64(&stat_osif_malloc_sub32k); + else if (size < 65536) + atomic_inc_64(&stat_osif_malloc_sub64k); + else if (size < 131072) + atomic_inc_64(&stat_osif_malloc_sub128k); + tr = ExAllocatePoolWithTag(NonPagedPoolNx, size, '!SFZ'); ASSERT(P2PHASE(tr, PAGE_SIZE) == 0); if (tr != NULL) { @@ -132,10 +148,13 @@ osif_malloc(uint64_t size) } else { dprintf("%s:%d: ExAllocatePoolWithTag failed (memusage: %llu)" "\n", __func__, __LINE__, segkmem_total_mem_allocated); + extern volatile unsigned int vm_page_free_wanted; extern volatile unsigned int vm_page_free_min; - spl_free_set_pressure(vm_page_free_min); + spl_free_set_emergency_pressure(vm_page_free_min); vm_page_free_wanted = vm_page_free_min; + + atomic_inc_64(&stat_osif_malloc_fail); return (NULL); } #else @@ -144,7 +163,7 @@ osif_malloc(uint64_t size) } void -osif_free(void *buf, uint64_t size) +osif_free(const void *buf, uint64_t size) { #ifdef _KERNEL ExFreePoolWithTag(buf, '!SFZ'); @@ -163,7 +182,13 @@ osif_free(void *buf, uint64_t size) void kernelheap_init() { - heap_arena = vmem_init("heap", NULL, 0, PAGESIZE, segkmem_alloc, + heap_arena = vmem_init("heap", NULL, 0, +#if defined(__arm64__) + 4096, +#else + PAGESIZE, +#endif + segkmem_alloc, segkmem_free); } @@ -181,7 +206,7 @@ segkmem_alloc(vmem_t *vmp, size_t size, int maybe_unmasked_vmflag) } void -segkmem_free(vmem_t *vmp, void *inaddr, size_t size) +segkmem_free(vmem_t *vmp, const void *inaddr, size_t size) { osif_free(inaddr, size); // since this is mainly called by spl_root_arena and free_arena, @@ -230,18 +255,22 @@ segkmem_abd_init() * PAGESIZE is an even multiple of at least several SPA_MINBLOCKSIZE. * This will be _Static_assert-ed in abd_os.c. 
*/ -#if 0 // macos + abd_subpage_arena = vmem_create("abd_subpage_cache", NULL, 0, 512, vmem_alloc_impl, vmem_free_impl, abd_arena, 131072, VM_SLEEP | VMC_NO_QCACHE | VM_FIRSTFIT); VERIFY3P(abd_subpage_arena, !=, NULL); -#endif + } void segkmem_abd_fini(void) { + if (abd_subpage_arena) { + vmem_destroy(abd_subpage_arena); + } + if (abd_arena) { vmem_destroy(abd_arena); } diff --git a/module/os/windows/spl/spl-vmem.c b/module/os/windows/spl/spl-vmem.c index 03e3e702251a..eb7262ae2574 100644 --- a/module/os/windows/spl/spl-vmem.c +++ b/module/os/windows/spl/spl-vmem.c @@ -26,7 +26,7 @@ /* * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. - * Copyright (c) 2017 Sean Doran + * Copyright (c) 2017, 2021, 2023 by Sean Doran */ /* @@ -367,7 +367,9 @@ static vmem_kstat_t vmem_kstat_template = { { "parent_alloc", KSTAT_DATA_UINT64 }, { "parent_free", KSTAT_DATA_UINT64 }, { "threads_waiting", KSTAT_DATA_UINT64 }, - { "excess", KSTAT_DATA_UINT64 }, + { "excess", KSTAT_DATA_UINT64 }, + { "lowest_stack", KSTAT_DATA_UINT64 }, + { "async_stack_calls", KSTAT_DATA_UINT64 }, }; @@ -406,20 +408,14 @@ uint64_t spl_vmem_conditional_alloc_deny = 0; uint64_t spl_vmem_conditional_alloc_deny_bytes = 0; // bucket allocator kstat -uint64_t spl_xat_success = 0; -uint64_t spl_xat_late_success = 0; -uint64_t spl_xat_late_success_nosleep = 0; uint64_t spl_xat_pressured = 0; -uint64_t spl_xat_bailed = 0; -uint64_t spl_xat_bailed_contended = 0; uint64_t spl_xat_lastalloc = 0; uint64_t spl_xat_lastfree = 0; -uint64_t spl_xat_forced = 0; uint64_t spl_xat_sleep = 0; -uint64_t spl_xat_late_deny = 0; -uint64_t spl_xat_no_waiters = 0; -uint64_t spl_xft_wait = 0; +uint64_t spl_vba_fastpath = 0; +uint64_t spl_vba_fastexit = 0; +uint64_t spl_vba_slowpath = 0; uint64_t spl_vba_parent_memory_appeared = 0; uint64_t spl_vba_parent_memory_blocked = 0; uint64_t spl_vba_hiprio_blocked = 0; @@ -430,6 +426,8 @@ uint64_t spl_vba_loop_timeout_blocked = 0; uint64_t spl_vba_sleep = 0; uint64_t spl_vba_loop_entries = 0; +extern uint64_t stat_osif_malloc_fail; + // bucket minimum span size tunables uint64_t spl_bucket_tunable_large_span = 0; uint64_t spl_bucket_tunable_small_span = 0; @@ -451,7 +449,15 @@ extern void spl_free_set_emergency_pressure(int64_t p); extern uint64_t segkmem_total_mem_allocated; extern uint64_t total_memory; -_Atomic uint64_t spl_lowest_alloc_stack_remaining = 0; +extern uint64_t spl_enforce_memory_caps; +extern _Atomic uint64_t spl_dynamic_memory_cap; +extern hrtime_t spl_dynamic_memory_cap_last_downward_adjust; +extern kmutex_t spl_dynamic_memory_cap_lock; +extern uint64_t spl_dynamic_memory_cap_reductions; +extern uint64_t spl_dynamic_memory_cap_hit_floor; + +#define INITIAL_BLOCK_SIZE 16ULL*1024ULL*1024ULL +static char *initial_default_block = NULL; /* * Get a vmem_seg_t from the global segfree list. 
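The abd_subpage_arena enabled above imports 128 KiB spans from abd_arena and carves them on a 512-byte quantum, so it can serve ABD buffers smaller than PAGESIZE. A hedged sketch of how such an arena is used through the vmem_alloc_impl()/vmem_free_impl() interface declared in vmem.h; the wrapper names and the example size are illustrative only, and the real consumers live in abd_os.c.

/* Illustrative wrappers, not part of the patch. */
static void *
abd_subpage_alloc_sketch(size_t size)
{
	/* vmem rounds the request up to the arena quantum (512 bytes) */
	return (vmem_alloc_impl(abd_subpage_arena, size, VM_SLEEP));
}

static void
abd_subpage_free_sketch(void *buf, size_t size)
{
	/* size must match the size passed at allocation time */
	vmem_free_impl(abd_subpage_arena, buf, size);
}

For example, abd_subpage_alloc_sketch(1536) would return a 1536-byte segment carved from a span imported from abd_arena rather than consuming a whole page.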
@@ -626,6 +632,9 @@ vmem_freelist_insert_sort_by_time(vmem_t *vmp, vmem_seg_t *vsp) ASSERT(vsp->vs_span_createtime != 0); if (vsp->vs_span_createtime == 0) { + dprintf("SPL: %s: WARNING: " + "vsp->vs_span_createtime == 0 (%s)!\n", + __func__, vmp->vm_name); TraceEvent(TRACE_WARNING, "SPL: %s: WARNING: " "vsp->vs_span_createtime == 0 (%s)!\n", __func__, vmp->vm_name); @@ -1306,18 +1315,51 @@ spl_vmem_xnu_useful_bytes_free(void) extern _Atomic uint32_t spl_vm_pages_wanted; extern _Atomic uint32_t spl_vm_pressure_level; - if (spl_vm_pages_wanted > 0) - return (PAGE_SIZE * spl_vm_pages_reclaimed); + /* carve out a small reserve for unconditional allocs */ + const uint64_t reserve = total_memory >> 9ULL; + const uint64_t total_minus_reserve = total_memory - reserve; + + /* + * pages are wanted *and* we are in our reserve area, + * so we report only one page of "usable" memory. + * + * if we are below the reserve, return the amount left + */ + + if (spl_vm_pages_wanted > 0) { + if (segkmem_total_mem_allocated >= total_minus_reserve) + return (PAGE_SIZE * MAX(spl_vm_pages_reclaimed, 1)); + else + return (total_minus_reserve - + (segkmem_total_mem_allocated + + PAGE_SIZE * spl_vm_pages_reclaimed)); + } /* + * If there is pressure, and we are in the reserve area, + * then there is no "usable" memory, unless we have reclaimed + * some pages. + * * beware of large magic guard values, - * the pressure enum only goes to 4 + * the pressure enum only goes to 4. */ + if (spl_vm_pressure_level > 0 && - spl_vm_pressure_level < 100) - return (0); + spl_vm_pressure_level < 100) { + if (spl_vm_pages_reclaimed > 0) + return (PAGE_SIZE * spl_vm_pages_reclaimed); + else if (segkmem_total_mem_allocated < total_minus_reserve) + return (PAGE_SIZE); + else + return (0); + } - return (total_memory - segkmem_total_mem_allocated); + /* + * No pressure: return non-reserved bytes not allocated. + * The reserve may be needed for VM_NOWAIT and VM_PANIC flags. + */ + + return (total_minus_reserve - segkmem_total_mem_allocated); } uint64_t @@ -1336,53 +1378,21 @@ spl_vmem_malloc_unconditionally_unlocked(size_t size) return (osif_malloc(size)); } -static void * -spl_vmem_malloc_unconditionally(size_t size) -{ - mutex_enter(&vmem_xnu_alloc_lock); - void *m = spl_vmem_malloc_unconditionally_unlocked(size); - mutex_exit(&vmem_xnu_alloc_lock); - return (m); -} - -static void * -spl_vmem_malloc_if_no_pressure(size_t size) -{ - // The mutex serializes concurrent callers, providing time for - // the variables in spl_vmem_xnu_useful_bytes_free() to be updated. - mutex_enter(&vmem_xnu_alloc_lock); - if (spl_vmem_xnu_useful_bytes_free() > (MAX(size, 1024ULL*1024ULL))) { - extern void *osif_malloc(uint64_t); - void *p = osif_malloc(size); - if (p != NULL) { - spl_vmem_conditional_allocs++; - spl_vmem_conditional_alloc_bytes += size; - } - mutex_exit(&vmem_xnu_alloc_lock); - return (p); - } else { - spl_vmem_conditional_alloc_deny++; - spl_vmem_conditional_alloc_deny_bytes += size; - mutex_exit(&vmem_xnu_alloc_lock); - return (NULL); - } -} - /* * Allocate size bytes at offset phase from an align boundary such that the * resulting segment [addr, addr + size) is a subset of [minaddr, maxaddr) * that does not straddle a nocross-aligned boundary. 
*/ -void * +inline void * vmem_xalloc(vmem_t *vmp, size_t size, size_t align_arg, size_t phase, size_t nocross, void *minaddr, void *maxaddr, int vmflag) { vmem_seg_t *vsp; vmem_seg_t *vbest = NULL; - uintptr_t addr, taddr, start, end; + uintptr_t addr = 0, taddr, start, end; uintptr_t align = (align_arg != 0) ? align_arg : vmp->vm_quantum; void *vaddr, *xvaddr = NULL; - size_t xsize; + size_t xsize = 0; int hb, flist, resv; uint32_t mtbf; @@ -1541,7 +1551,7 @@ vmem_xalloc(vmem_t *vmp, size_t size, size_t align_arg, size_t phase, vmp->vm_nsegfree -= resv; /* reserve our segs */ mutex_exit(&vmp->vm_lock); if (vmp->vm_cflags & VMC_XALLOC) { - size_t oasize = asize; + ASSERTV(size_t oasize = asize); vaddr = ((vmem_ximport_t *) vmp->vm_source_alloc)(vmp->vm_source, &asize, align, vmflag & VM_KMFLAGS); @@ -1689,7 +1699,7 @@ vmem_xalloc(vmem_t *vmp, size_t size, size_t align_arg, size_t phase, * both routines bypass the quantum caches. */ void -vmem_xfree(vmem_t *vmp, void *vaddr, size_t size) +vmem_xfree(vmem_t *vmp, const void *vaddr, size_t size) { vmem_seg_t *vsp, *vnext, *vprev; @@ -1733,7 +1743,8 @@ vmem_xfree(vmem_t *vmp, void *vaddr, size_t size) vmem_span_destroy(vmp, vsp); vmp->vm_kstat.vk_parent_free.value.ui64++; mutex_exit(&vmp->vm_lock); - vmp->vm_source_free(vmp->vm_source, vaddr, size); + vmp->vm_source_free(vmp->vm_source, + __DECONST(void *, vaddr), size); } else { vmem_freelist_insert(vmp, vsp); mutex_exit(&vmp->vm_lock); @@ -1749,6 +1760,18 @@ vmem_xfree(vmem_t *vmp, void *vaddr, size_t size) * instead of the default instant-fit policy. VM_SLEEP allocations are * guaranteed to succeed. */ +/* + * If there is less space on the kernel stack than + * (dynamically tunable) spl_split_stack_below + * then perform the vmem_alloc in the thread_call + * function. Don't set it to 16384, because then it + * continuously triggers, and we hang. + */ +unsigned long spl_split_stack_below = 8192; + +/* kstat tracking the global minimum free stack space */ +_Atomic unsigned int spl_lowest_alloc_stack_remaining = UINT_MAX; + void * vmem_alloc_impl(vmem_t *vmp, size_t size, int vmflag) { @@ -1816,7 +1839,7 @@ vmem_alloc_impl(vmem_t *vmp, size_t size, int vmflag) * Free the segment [vaddr, vaddr + size). */ void -vmem_free_impl(vmem_t *vmp, void *vaddr, size_t size) +vmem_free_impl(vmem_t *vmp, const void *vaddr, size_t size) { if (size - 1 < vmp->vm_qcache_max) kmem_cache_free(vmp->vm_qcache[(size - 1) >> vmp->vm_qshift], @@ -1990,7 +2013,7 @@ spl_vmem_size(vmem_t *vmp, int typemask) static vmem_t * vmem_create_common(const char *name, void *base, size_t size, size_t quantum, void *(*afunc)(vmem_t *, size_t, int), - void (*ffunc)(vmem_t *, void *, size_t), + void (*ffunc)(vmem_t *, const void *, size_t), vmem_t *source, size_t qcache_max, int vmflag) { int i; @@ -2063,7 +2086,7 @@ vmem_create_common(const char *name, void *base, size_t size, size_t quantum, vmp->vm_kstat.vk_source_id.value.ui32 = source->vm_id; vmp->vm_source = source; vmp->vm_source_alloc = afunc; - vmp->vm_source_free = ffunc; + vmp->vm_source_free = __DECONST(void *, ffunc); /* * Some arenas (like vmem_metadata and kmem_metadata) cannot @@ -2269,39 +2292,78 @@ int vmem_rescale_minshift = 3; /* * Resize vmp's hash table to keep the average lookup depth near 1.0. + * + * The decision to exit early, before allocating a new table, is done outside + * a mutex lock. The calculation of memory that should be allocated and the + * allocation itself is also done outside the lock. 
The allocation CANNOT be + * safely done under this mutex, and there is no reason to lock the subsequent + * memset. + * + * However, another thread (including ones possible awakened by the + * cv_broadcast() in our caller vmem_update()) can change the number of bytes + * allcated or freed in our vmem arena; enough of a downward change (e.g. from + * reaping after a reduction of ARC frees many scatter ABDs) will cause our + * previous outside-the-lock new_table allocation to be the wrong size, + * potentially leading to a loss of information about vmem_alloc_impl() + * allocations made before we acquire vmp->vm_lock. In turn, this leads + * to a panic when doing a vmem_free_impl() on an improperly-recorded segment. + * + * Consequently once we hold vmp->vm_lock we must recalculate new_size and + * compare that with the previously-calculated nolock_new_size. If they do + * not match we must clean up and return rather than attempt to use new_table. */ static void vmem_hash_rescale(vmem_t *vmp) { - vmem_seg_t **old_table, **new_table, *vsp; - size_t old_size, new_size, h, nseg; + vmem_seg_t **new_table, *vsp; - nseg = (size_t)(vmp->vm_kstat.vk_alloc.value.ui64 - + const size_t nolock_nseg = + (size_t)(vmp->vm_kstat.vk_alloc.value.ui64 - vmp->vm_kstat.vk_free.value.ui64); - new_size = MAX(VMEM_HASH_INITIAL, 1 << (highbit(3 * nseg + 4) - 2)); - old_size = vmp->vm_hash_mask + 1; + const size_t nolock_new_size = MAX(VMEM_HASH_INITIAL, + 1 << (highbit(3 * nolock_nseg + 4) - 2)); + const size_t nolock_old_size = vmp->vm_hash_mask + 1; - if ((old_size >> vmem_rescale_minshift) <= new_size && - new_size <= (old_size << 1)) + if ((nolock_old_size >> vmem_rescale_minshift) <= nolock_new_size && + nolock_new_size <= (nolock_old_size << 1)) return; - new_table = vmem_alloc_impl(vmem_hash_arena, new_size * sizeof (void *), + new_table = vmem_alloc_impl(vmem_hash_arena, + nolock_new_size * sizeof (void *), VM_NOSLEEP); if (new_table == NULL) return; - memset(new_table, 0, new_size * sizeof (void *)); + memset(new_table, 0, nolock_new_size * sizeof (void *)); mutex_enter(&vmp->vm_lock); - old_size = vmp->vm_hash_mask + 1; - old_table = vmp->vm_hash_table; + const size_t nseg = (size_t)(vmp->vm_kstat.vk_alloc.value.ui64 - + vmp->vm_kstat.vk_free.value.ui64); + + const size_t new_size = MAX(VMEM_HASH_INITIAL, + 1 << (highbit(3 * nseg + 4) - 2)); + + if (new_size != nolock_new_size) { + dprintf("ZFS: SPL: %s:%d:%s:" + " race condition found: %s, %ld, %ld\n", + __FILE__, __LINE__, __func__, + vmp->vm_name, + nolock_new_size, new_size); + mutex_exit(&vmp->vm_lock); + vmem_free_impl(vmem_hash_arena, new_table, + nolock_new_size * sizeof (void *)); + return; + } + + const size_t old_size = vmp->vm_hash_mask + 1; + vmem_seg_t **old_table = vmp->vm_hash_table; vmp->vm_hash_mask = new_size - 1; vmp->vm_hash_table = new_table; vmp->vm_hash_shift = highbit(vmp->vm_hash_mask); - for (h = 0; h < old_size; h++) { + for (size_t h = 0; h < old_size; h++) { vsp = old_table[h]; while (vsp != NULL) { uintptr_t addr = vsp->vs_start; @@ -2335,8 +2397,15 @@ vmem_update(void *dummy) * If threads are waiting for resources, wake them up * periodically so they can issue another kmem_reap() * to reclaim resources cached by the slab allocator. + * + * In general it is good practice to take the associated + * lock before calling cv_broadcast(). Here it gives any + * waiters a good shot at the lock that may be (re-)taken + * by this thread in vmem_hash_rescale() function. 
*/ + mutex_enter(&vmp->vm_lock); cv_broadcast(&vmp->vm_cv); + mutex_exit(&vmp->vm_lock); /* * Rescale the hash table to keep the hash chains short. @@ -2383,7 +2452,7 @@ vmem_bucket_number(size_t size) if (bucket < 0) bucket = 0; - return (bucket); + return ((int16_t)bucket); } static inline vmem_t * @@ -2400,353 +2469,125 @@ spl_vmem_bucket_arena_by_size(size_t size) return (vmem_bucket_arena_by_size(size)); } +/* + * We have just freed memory back to Windows so we let any waiters on the + * lowest-level bucket arenas know they have a chance to make progress in + * their hunt for memory from the operating system. We then tell the heap that + * there may be memory freshly imported into the buckets. + * + * This function broadcasts to waiters on the smallest-span buckets first, and + * because of mutex-ordering this biases towards small-allocation kmem caches. + */ static inline void vmem_bucket_wake_all_waiters(void) { for (int i = VMEM_BUCKET_LOWBIT; i < VMEM_BUCKET_HIBIT; i++) { const int bucket = i - VMEM_BUCKET_LOWBIT; vmem_t *bvmp = vmem_bucket_arena[bucket]; + mutex_enter(&bvmp->vm_lock); cv_broadcast(&bvmp->vm_cv); + mutex_exit(&bvmp->vm_lock); } + mutex_enter(&spl_heap_arena->vm_lock); cv_broadcast(&spl_heap_arena->vm_cv); -} - -/* - * xnu_alloc_throttled_bail() : spin looking for memory - * - */ - -static inline void * -xnu_alloc_throttled_bail(uint64_t now_ticks, vmem_t *calling_vmp, - size_t size, int vmflags) -{ - // spin looking for memory - const uint64_t bigtarget = MAX(size, 16ULL*1024ULL*1024ULL); - static volatile _Atomic bool alloc_lock = false; - static volatile _Atomic uint64_t force_time = 0; - - uint64_t timeout_ticks = hz / 2; - if (vmflags & VM_PUSHPAGE) - timeout_ticks = hz / 4; - - uint64_t timeout_time = now_ticks + timeout_ticks; - - for (uint32_t suspends = 0, blocked_suspends = 0, - try_no_pressure = 0; /* empty */; /* empty */) { - if (force_time + timeout_ticks > timeout_time) { - // another thread has forced an allocation - // by timing out. push our deadline into the future. - timeout_time = force_time + timeout_ticks; - } - if (alloc_lock) { - blocked_suspends++; - IOSleep(1); - } else if (spl_vmem_xnu_useful_bytes_free() >= bigtarget) { - bool f = false; - // if alloc_lock == f then alloc_lock = true and result - // is true otherwise result is false and f = true - if (!__c11_atomic_compare_exchange_strong(&alloc_lock, - &f, true, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) { - /* - * avoid (highly unlikely) data race on - * alloc_lock. if alloc_lock has become true - * while we were in the else if expression - * then we effectively optimize away the - * (relaxed) load of alloc_lock (== true) - * into f and continue. 
- */ - continue; - } - // alloc_lock is now visible as true to all threads - try_no_pressure++; - void *m = spl_vmem_malloc_if_no_pressure(size); - if (m != NULL) { - uint64_t ticks = zfs_lbolt() - now_ticks; - dprintf("SPL: %s returning %llu bytes after " - "%llu ticks (hz=%u, seconds = %llu), " - "%u suspends, %u blocked, %u tries (%s)\n", - __func__, (uint64_t)size, - ticks, hz, ticks/hz, suspends, - blocked_suspends, try_no_pressure, - calling_vmp->vm_name); - // atomic seq cst, so is published to all - // threads - alloc_lock = false; - return (m); - } else { - alloc_lock = false; - spl_free_set_emergency_pressure(bigtarget); - suspends++; - IOSleep(1); - } - } else if (zfs_lbolt() > timeout_time) { - bool f = false; - if (!__c11_atomic_compare_exchange_strong(&alloc_lock, - &f, true, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) { - // avoid (highly unlikely) data race on - // alloc_lock as above - continue; - } - void *mp = spl_vmem_malloc_unconditionally(size); - uint64_t now = zfs_lbolt(); - uint64_t ticks = now - now_ticks; - force_time = now; - dprintf("SPL: %s TIMEOUT %llu bytes after " - "%llu ticks (hz=%u, seconds=%llu), " - "%u suspends, %u blocked, %u tries (%s)\n", - __func__, (uint64_t)size, - ticks, hz, ticks/hz, suspends, - blocked_suspends, try_no_pressure, - calling_vmp->vm_name); - alloc_lock = false; - atomic_inc_64(&spl_xat_forced); - return (mp); - } else { - spl_free_set_emergency_pressure(bigtarget); - suspends++; - IOSleep(1); - } - } + mutex_exit(&spl_heap_arena->vm_lock); } static void * xnu_alloc_throttled(vmem_t *bvmp, size_t size, int vmflag) { - // the caller is one of the bucket arenas. - // null_vmp will be spl_default_arena_parent, which is - // just a placeholder. - - uint64_t now = zfs_lbolt(); - const uint64_t entry_now = now; + static volatile _Atomic uint64_t fail_at = 0; + static volatile _Atomic int16_t success_ct = 0; - void *m = spl_vmem_malloc_if_no_pressure(size); + void *p = spl_vmem_malloc_unconditionally_unlocked(size); - if (m != NULL) { - atomic_inc_64(&spl_xat_success); - spl_xat_lastalloc = gethrtime(); - // wake up waiters on all the arena condvars - // since there is apparently no memory shortage. - vmem_bucket_wake_all_waiters(); - return (m); - } else { - spl_free_set_emergency_pressure((int64_t)size); - } - - if (vmflag & VM_PANIC) { - // force an allocation now to avoid a panic + if (p != NULL) { + /* grow fail_at periodically */ + if (success_ct++ >= 128) { + fail_at += size; + success_ct = 0; + } spl_xat_lastalloc = gethrtime(); - spl_free_set_emergency_pressure(4LL * (int64_t)size); - void *p = spl_vmem_malloc_unconditionally(size); - // p cannot be NULL (unconditional kernel malloc always works - // or panics) - // therefore: success, wake all waiters on alloc|free condvar - // wake up arena waiters to let them know there is memory - // available in the arena; let waiters on other bucket arenas - // continue sleeping. 
cv_broadcast(&bvmp->vm_cv); return (p); } - if (vmflag & VM_NOSLEEP) { - spl_free_set_emergency_pressure(MAX(2LL * (int64_t)size, - 16LL*1024LL*1024LL)); - /* cheating a bit, but not really waiting */ - kpreempt(KPREEMPT_SYNC); - void *p = spl_vmem_malloc_if_no_pressure(size); - if (p != NULL) { - atomic_inc_64(&spl_xat_late_success_nosleep); - cv_broadcast(&bvmp->vm_cv); - spl_xat_lastalloc = gethrtime(); - } - // if p == NULL, then there will be an increment in - // the fail kstat - return (p); - } + success_ct = 0; + fail_at = segkmem_total_mem_allocated - size; /* - * Loop for a while trying to satisfy VM_SLEEP allocations. - * - * If we are able to allocate memory, then return the pointer. - * - * We return NULL if some other thread's activity has caused - * sufficient memory to appear in this arena that we can satisfy - * the allocation. - * - * We call xnu_alloc_throttle_bail() after a few milliseconds of - * waiting; it will either return a pointer to newly allocated - * memory or NULL. We return the result. + * adjust dynamic memory cap downwards by 1/32 (~ 3%) of total_memory + * but do not drop below 1/8 of total_memory.. * + * see also spl-kmem.c:spl_reduce_dynamic_cap(), which is + * triggered by ARC or other clients inquiring about spl_free() */ + if (spl_enforce_memory_caps != 0 && + (fail_at < spl_dynamic_memory_cap || + spl_dynamic_memory_cap == 0)) { + mutex_enter(&spl_dynamic_memory_cap_lock); + + spl_dynamic_memory_cap_last_downward_adjust = gethrtime(); + const int64_t thresh = total_memory >> 3; + const int64_t below_fail_at = fail_at - (total_memory >> 5); + const int64_t reduced = MAX(below_fail_at, thresh); + + if (spl_dynamic_memory_cap == 0 || + spl_dynamic_memory_cap >= total_memory) { + spl_dynamic_memory_cap = reduced; + atomic_inc_64(&spl_dynamic_memory_cap_reductions); + } else if (thresh > spl_dynamic_memory_cap) { + spl_dynamic_memory_cap = thresh; + atomic_inc_64(&spl_dynamic_memory_cap_hit_floor); + } else { + spl_dynamic_memory_cap = reduced; + atomic_inc_64(&spl_dynamic_memory_cap_reductions); + } - const uint32_t bucket_number = - vmem_bucket_id_to_bucket_number[bvmp->vm_id]; - static volatile _Atomic uint32_t waiters = 0; - - waiters++; - - if (waiters == 1UL) - atomic_inc_64(&spl_xat_no_waiters); - - static _Atomic uint32_t max_waiters_seen = 0; - - if (waiters > max_waiters_seen) { - max_waiters_seen = waiters; - dprintf("SPL: %s: max_waiters_seen increased to %u\n", __func__, - max_waiters_seen); + mutex_exit(&spl_dynamic_memory_cap_lock); } - boolean_t local_xat_pressured = false; + /* wait until used memory falls below failure_at */ - for (; /* empty */; /* empty */) { - clock_t wait_time = USEC2NSEC(500UL * MAX(waiters, 1UL)); - mutex_enter(&bvmp->vm_lock); - spl_xat_sleep++; - if (local_xat_pressured) { - spl_xat_pressured++; - local_xat_pressured = false; - } - (void) cv_timedwait_hires(&bvmp->vm_cv, &bvmp->vm_lock, - wait_time, 0, 0); - mutex_exit(&bvmp->vm_lock); - now = zfs_lbolt(); - // We may be here because of a broadcast to &vmp->vm_cv, - // causing xnu to schedule all the sleepers in priority-weighted - // FIFO order. Because of the mutex_exit(), the sections below - // here may be entered concurrently. - // spl_vmem_malloc_if_no_pressure does a mutex, so avoid calling - // it unless there is a chance it will succeed. 
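/*
 * Illustrative sketch only (not part of the patch): the arithmetic of the
 * spl_dynamic_memory_cap reduction in xnu_alloc_throttled() above, worked
 * through with hypothetical numbers (16 GiB of total_memory, an allocation
 * failure observed with 6 GiB already allocated, cap previously unset).
 * The function name and constants are purely for illustration.
 */
static void
spl_dynamic_cap_example(void)
{
	const uint64_t example_total_memory = 16ULL << 30;	/* 16 GiB */
	const uint64_t example_fail_at = 6ULL << 30;		/* 6 GiB */

	/* floor: never drop the cap below 1/8 of total memory (2 GiB) */
	const int64_t thresh = example_total_memory >> 3;

	/* pull the cap 1/32 of total memory below the failure point: 5.5 GiB */
	const int64_t below_fail_at =
	    example_fail_at - (example_total_memory >> 5);

	/* cap was 0 (unset), so it becomes MAX(5.5 GiB, 2 GiB) = 5.5 GiB */
	const int64_t reduced = MAX(below_fail_at, thresh);

	/* each failed pass also requests total_memory >> 7 (128 MiB) of relief */
	(void) reduced;
}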
- if (spl_vmem_xnu_useful_bytes_free() > (MAX(size, - 16ULL*1024ULL*1024ULL))) { - void *a = spl_vmem_malloc_if_no_pressure(size); - if (a != NULL) { - atomic_inc_64(&spl_xat_late_success); - spl_xat_lastalloc = gethrtime(); - waiters--; - // Wake up all waiters on the bucket arena - // locks, since the system apparently has - // memory again. - vmem_bucket_wake_all_waiters(); - return (a); - } else { - // Probably spl_vm_page_free_count changed while - // we were in the mutex queue in - // spl_vmem_malloc_if_no_pressure(). There is - // therefore no point in doing the bail-out - // check below, so go back to the top of the - // for loop. - atomic_inc_64(&spl_xat_late_deny); - continue; - } - } - if (now > entry_now + hz / 4 || - spl_vba_threads[bucket_number] > 1UL) { - // If there are other threads waiting for us - // in vba() then when we satisfy this allocation, - // we satisfy more than one thread, so invoke XATB(). - // Otherwise, if we have had no luck for 250 ms, then - // switch to XATB() which is much more aggressive. - if (spl_vba_threads[bucket_number] > 1UL) - atomic_inc_64(&spl_xat_bailed_contended); - atomic_inc_64(&spl_xat_bailed); - static _Atomic uint32_t bailing_threads = 0; - static _Atomic uint32_t max_bailers_seen = 0; - bailing_threads++; - if (bailing_threads > max_bailers_seen) { - max_bailers_seen = bailing_threads; - dprintf("SPL: %s: max_bailers_seen increased " - "to %u\n", __func__, max_bailers_seen); - } - void *b = - xnu_alloc_throttled_bail(now, bvmp, size, vmflag); - bailing_threads--; - spl_xat_lastalloc = gethrtime(); - // wake up waiters on the arena lock, - // since they now have memory they can use. - cv_broadcast(&bvmp->vm_cv); - // open turnstile after having bailed, rather - // than before - waiters--; - return (b); - } else if (now - entry_now > 0 && - ((now - entry_now) % (hz/10))) { - spl_free_set_emergency_pressure(MAX(size, - 16LL*1024LL*1024LL)); - local_xat_pressured = true; + extern void spl_set_arc_no_grow(int); + spl_set_arc_no_grow(B_TRUE); + spl_free_set_emergency_pressure(total_memory >> 7LL); + atomic_inc_64(&spl_xat_pressured); + if ((vmflag & (VM_NOSLEEP | VM_PANIC | VM_ABORT)) > 0) + return (NULL); + + for (uint64_t loop_for_mem = 1; ; loop_for_mem++) { + // ASSERT3U((loop_for_mem % 10), ==, 0); // 1 second bleat beat + IOSleep(100); /* hope someone frees memory */ + /* only try to allocate if there is memory */ + if (fail_at > segkmem_total_mem_allocated) { + p = spl_vmem_malloc_unconditionally_unlocked(size); + if (p != NULL) + return (p); + } else { + /* abuse existing kstat */ + atomic_inc_64(&spl_xat_sleep); } + success_ct = 0; + const uint64_t x = segkmem_total_mem_allocated - size; + if (fail_at > x) + fail_at = x; + spl_set_arc_no_grow(B_TRUE); + spl_free_set_emergency_pressure(total_memory >> 7LL); + atomic_inc_64(&spl_xat_pressured); + /* after ten seconds, just return NULL */ + if (loop_for_mem > 100) + return (NULL); } } static void -xnu_free_throttled(vmem_t *vmp, void *vaddr, size_t size) +xnu_free_throttled(vmem_t *vmp, const void *vaddr, size_t size) { - extern void osif_free(void *, uint64_t); + extern void osif_free(const void *, uint64_t); - // Serialize behind a (short) spin-sleep delay, giving - // xnu time to do freelist management and - // PT teardowns - - // In the usual case there is only one thread in this function, - // so we proceed waitlessly to osif_free(). - - // When there are multiple threads here, we delay the 2nd and later. 
- - // Explict race: - // The osif_free() is not protected by the vmem_xnu_alloc_lock - // mutex; that is just used for implementing the delay. Consequently, - // the waiters on the same lock in spl_vmem_malloc_if_no_pressure may - // falsely see too small a value for spl_vm_page_free_count. We don't - // care in part because xnu performs poorly when doing - // free-then-allocate anwyay. - - // a_waiters gauges the loop exit checking and sleep duration; - // it is a count of the number of threads trying to do work - // in this function. - static volatile _Atomic uint32_t a_waiters = 0; - - // is_freeing protects the osif_free() call; see comment below - static volatile _Atomic bool is_freeing = false; - - a_waiters++; // generates "lock incl ..." - - static _Atomic uint32_t max_waiters_seen = 0; - - if (a_waiters > max_waiters_seen) { - max_waiters_seen = a_waiters; - dprintf("SPL: %s: max_waiters_seen increased to %u\n", - __func__, max_waiters_seen); - } - - for (uint32_t iter = 0; a_waiters > 1UL; iter++) { - // there is more than one thread here, so suspend and - // sleep for 1 ms - atomic_inc_64(&spl_xft_wait); - IOSleep(1); - // If are growing old in this loop, then see if - // anyone else is still in osif_free. If not, - // we can exit. - if (iter >= a_waiters) { - // if is_freeing == f, then set is_freeing to true with - // release semantics (i.e. "push" it to other cores) - // then break; otherwise, set f to true relaxedly (i.e., - // optimize it out) - bool f = false; - if (__c11_atomic_compare_exchange_weak(&is_freeing, - &f, true, __ATOMIC_RELEASE, __ATOMIC_RELAXED)) { - break; - } - } - } - // If there is more than one thread in this function, osif_free() is - // protected by is_freeing. Release it after the osif_free() - // call has been made and the lastfree bookkeeping has been done. osif_free(vaddr, size); spl_xat_lastfree = gethrtime(); - is_freeing = false; - a_waiters--; - kpreempt(KPREEMPT_SYNC); - // since we just gave back xnu enough to satisfy an allocation - // in at least the smaller buckets, let's wake up anyone in - // the cv_wait() in vmem_xalloc([bucket_#], ...) vmem_bucket_wake_all_waiters(); } @@ -2797,13 +2638,30 @@ vmem_bucket_alloc(vmem_t *null_vmp, size_t size, const int vmflags) vmem_t *bvmp = vmem_bucket_arena_by_size(size); + void *fastm = vmem_alloc_impl(bvmp, size, + local_hipriority_allocator ? vmflags : vmflags | VM_BESTFIT); + + if (fastm != NULL) { + atomic_inc_64(&spl_vba_fastpath); + cv_broadcast(&calling_arena->vm_cv); + return (fastm); + } else if ((vmflags & (VM_NOSLEEP | VM_PANIC | VM_ABORT)) > 0) { + atomic_inc_64(&spl_vba_fastexit); + return (NULL); + } + + atomic_inc_64(&spl_vba_slowpath); + + /* work harder to avoid an allocation */ + const int slow_vmflags = vmflags | VM_BESTFIT; + // there are 13 buckets, so use a 16-bit scalar to hold // a set of bits, where each bit corresponds to an in-progress // vmem_alloc_impl(bucket, ...) below. static volatile _Atomic uint16_t buckets_busy_allocating = 0; const uint16_t bucket_number = vmem_bucket_number(size); - const uint16_t bucket_bit = (uint16_t)1 << bucket_number; + const uint16_t bucket_bit = (uint16_t)(1 << bucket_number); spl_vba_threads[bucket_number]++; @@ -2814,12 +2672,13 @@ vmem_bucket_alloc(vmem_t *null_vmp, size_t size, const int vmflags) bool loop_once = false; - if ((vmflags & (VM_NOSLEEP | VM_PANIC | VM_ABORT)) == 0 && + if ((slow_vmflags & (VM_NOSLEEP | VM_PANIC | VM_ABORT)) == 0 && ! 
vmem_canalloc_atomic(bvmp, size)) { if (spl_vmem_xnu_useful_bytes_free() < (MAX(size, 16ULL*1024ULL*1024ULL))) { - spl_free_set_emergency_pressure(size); - IOSleep(1); + spl_free_set_emergency_pressure( + total_memory >> 7LL); + IOSleep(2); if (!vmem_canalloc_atomic(bvmp, size) && (spl_vmem_xnu_useful_bytes_free() < (MAX(size, 16ULL*1024ULL*1024ULL)))) { @@ -2875,11 +2734,11 @@ vmem_bucket_alloc(vmem_t *null_vmp, size_t size, const int vmflags) loop_once = false; // non-waiting allocations should proceeed to vmem_alloc_impl() // immediately - if (vmflags & (VM_NOSLEEP | VM_PANIC | VM_ABORT)) { + if (slow_vmflags & (VM_NOSLEEP | VM_PANIC | VM_ABORT)) { break; } if (vmem_canalloc_atomic(bvmp, size)) { - // We can probably vmem_alloc(bvmp, size, vmflags). + // We can probably vmem_alloc(bvmp, size, slow_vmflags). // At worst case it will give us a NULL and we will // end up on the vmp's cv_wait. // @@ -2990,7 +2849,7 @@ vmem_bucket_alloc(vmem_t *null_vmp, size_t size, const int vmflags) timedout |= 2; extern uint64_t real_total_memory; spl_free_set_emergency_pressure( - real_total_memory / 64LL); + total_memory >> 7LL); // flush the current thread in xat() out of // xat()'s for() loop and into xat_bail() cv_broadcast(&bvmp->vm_cv); @@ -3083,7 +2942,7 @@ vmem_bucket_alloc(vmem_t *null_vmp, size_t size, const int vmflags) // because waiters was 0 when we entered this function, // subsequent callers will enter the for loop. - void *m = vmem_alloc_impl(bvmp, size, vmflags); + void *m = vmem_alloc_impl(bvmp, size, slow_vmflags); // allow another vmem_canalloc() through for this bucket // by atomically turning off the appropriate bit @@ -3118,7 +2977,7 @@ vmem_bucket_alloc(vmem_t *null_vmp, size_t size, const int vmflags) } static void -vmem_bucket_free(vmem_t *null_vmp, void *vaddr, size_t size) +vmem_bucket_free(vmem_t *null_vmp, const void *vaddr, size_t size) { vmem_t *calling_arena = spl_heap_arena; @@ -3150,7 +3009,7 @@ vmem_buckets_size(int typemask) { int64_t total_size = 0; - for (int i = 0; i < VMEM_BUCKETS; i++) { + for (uint16_t i = 0; i < VMEM_BUCKETS; i++) { int64_t u = vmem_bucket_arena_used(i); int64_t f = vmem_bucket_arena_free(i); if (typemask & VMEM_ALLOC) @@ -3277,9 +3136,9 @@ spl_vmem_default_alloc(vmem_t *vmp, size_t size, int vmflags) } static void -spl_vmem_default_free(vmem_t *vmp, void *vaddr, size_t size) +spl_vmem_default_free(vmem_t *vmp, const void *vaddr, size_t size) { - extern void osif_free(void *, uint64_t); + extern void osif_free(const void *, uint64_t); osif_free(vaddr, size); } @@ -3287,7 +3146,7 @@ vmem_t * vmem_init(const char *heap_name, void *heap_start, size_t heap_size, size_t heap_quantum, void *(*heap_alloc)(vmem_t *, size_t, int), - void (*heap_free)(vmem_t *, void *, size_t)) + void (*heap_free)(vmem_t *, const void *, size_t)) { uint32_t id; int nseg = VMEM_SEG_INITIAL; @@ -3313,7 +3172,7 @@ vmem_init(const char *heap_name, vmem_putseg_global(&vmem_seg0[nseg]); /* - * On OSX we ultimately have to use the OS allocator + * On Windows we ultimately have to use the OS allocator * as the ource and sink of memory as it is allocated * and freed. * @@ -3334,8 +3193,13 @@ vmem_init(const char *heap_name, // Intel can go with 4096 alignment, but arm64 needs 16384. So // we just use the larger. 
// turns out that Windows refuses alignment over 8192 - __declspec(align(PAGE_SIZE)) static char - initial_default_block[16ULL * 1024ULL * 1024ULL] = { 0 }; + // __declspec(align(PAGE_SIZE)) static char + // initial_default_block[INITIAL_BLOCK_SIZE] = { 0 }; + // ExAllocatePoolWithTag() + // If NumberOfBytes is PAGE_SIZE or greater, a page-aligned buffer + // is allocated + MALLOC(initial_default_block, void *, INITIAL_BLOCK_SIZE, M_TEMP, + M_WAITOK); // The default arena is very low-bandwidth; it supplies the initial // large allocation for the heap arena below, and it serves as the @@ -3343,9 +3207,9 @@ vmem_init(const char *heap_name, // or 3 parent_alloc calls (to spl_vmem_default_alloc) in total. spl_default_arena = vmem_create("spl_default_arena", // id 1 - initial_default_block, 16ULL*1024ULL*1024ULL, + initial_default_block, INITIAL_BLOCK_SIZE, heap_quantum, spl_vmem_default_alloc, spl_vmem_default_free, - spl_default_arena_parent, 16ULL*1024ULL*1024ULL, + spl_default_arena_parent, 32, /* minimum import */ VM_SLEEP | VMC_POPULATOR | VMC_NO_QCACHE); VERIFY(spl_default_arena != NULL); @@ -3377,17 +3241,15 @@ vmem_init(const char *heap_name, extern uint64_t real_total_memory; VERIFY3U(real_total_memory, >=, 1024ULL*1024ULL*1024ULL); - // adjust minimum bucket span size for memory size - // see comments in the switch below - // large span: 1 MiB and bigger on large-memory (> 32 GiB) systems - // small span: 256 kiB and bigger on large-memory systems - const uint64_t k = 1024ULL; - const uint64_t qm = 256ULL * k; - const uint64_t m = 1024ULL* k; - const uint64_t big = MAX(real_total_memory / (k * 32ULL), m); - const uint64_t small = MAX(real_total_memory / (k * 128ULL), qm); - spl_bucket_tunable_large_span = MIN(big, 16ULL * m); - spl_bucket_tunable_small_span = small; + /* + * Minimum bucket span size, which is what we ask IOMallocAligned for. + * See comments in the switch statement below. + * + * By default ask the kernel for at least 128kiB allocations. + */ + spl_bucket_tunable_large_span = spl_bucket_tunable_small_span = + 128ULL * 1024UL; + dprintf("SPL: %s: real_total_memory %llu, large spans %llu, small " "spans %llu\n", __func__, real_total_memory, spl_bucket_tunable_large_span, spl_bucket_tunable_small_span); @@ -3397,8 +3259,10 @@ vmem_init(const char *heap_name, for (int32_t i = VMEM_BUCKET_LOWBIT; i <= VMEM_BUCKET_HIBIT; i++) { const uint64_t bucket_largest_size = (1ULL << (uint64_t)i); + (void) snprintf(buf, VMEM_NAMELEN + 20, "%s_%llu", "bucket", bucket_largest_size); + dprintf("SPL: %s creating arena %s (i == %d)\n", __func__, buf, i); const int bucket_number = i - VMEM_BUCKET_LOWBIT; @@ -3411,13 +3275,14 @@ vmem_init(const char *heap_name, * bucket_heap arena. */ vmem_t *b = vmem_create(buf, NULL, 0, - // MAX(heap_quantum, bucket_largest_size), heap_quantum, xnu_alloc_throttled, xnu_free_throttled, spl_default_arena_parent, - MAX(heap_quantum * 8, bucket_largest_size * 2), + 32, /* minimum import */ VM_SLEEP | VMC_POPULATOR | VMC_NO_QCACHE | VMC_TIMEFREE); + VERIFY(b != NULL); + b->vm_source = b; vmem_bucket_arena[bucket_number] = b; vmem_bucket_id_to_bucket_number[b->vm_id] = bucket_number; @@ -3443,17 +3308,12 @@ vmem_init(const char *heap_name, // kstat.vmem.vmem.bucket_heap.parent_{alloc+free}, and improves with // increasing initial fixed allocation size. 
- const size_t mib = 1024ULL * 1024ULL; - const size_t gib = 1024ULL * mib; - size_t resv_size = 128ULL * mib; - extern uint64_t real_total_memory; + /* + * Add an initial segment to spl_heap_arena for convenience. + */ - if (real_total_memory >= 4ULL * gib) - resv_size = 256ULL * mib; - if (real_total_memory >= 8ULL * gib) - resv_size = 512ULL * mib; - if (real_total_memory >= 16ULL * gib) - resv_size = gib; + const size_t mib = 1024ULL * 1024ULL; + const size_t resv_size = 128ULL * mib; dprintf("SPL: %s adding fixed allocation of %llu to the bucket_heap\n", __func__, (uint64_t)resv_size); @@ -3465,6 +3325,7 @@ vmem_init(const char *heap_name, VERIFY(spl_heap_arena_initial_alloc != NULL); + /* remember size we allocated */ spl_heap_arena_initial_alloc_size = resv_size; // kstat.vmem.vmem.heap : kmem_cache_alloc() and similar calls @@ -3484,7 +3345,12 @@ vmem_init(const char *heap_name, vmem_metadata_arena = vmem_create("vmem_metadata", // id 17 NULL, 0, heap_quantum, vmem_alloc_impl, vmem_free_impl, spl_default_arena, - 8 * PAGESIZE, VM_SLEEP | VMC_POPULATOR | VMC_NO_QCACHE); +#ifdef __arm64__ + 2 * PAGESIZE, +#else + 8 * PAGESIZE, +#endif + VM_SLEEP | VMC_POPULATOR | VMC_NO_QCACHE); VERIFY(vmem_metadata_arena != NULL); @@ -3547,13 +3413,11 @@ static void vmem_fini_freelist(void *vmp, void *start, size_t size) void vmem_free_span_list(void) { - int total = 0; - int total_count = 0; + int total __maybe_unused = 0; struct free_slab *fs; // int release = 1; while ((fs = list_head(&freelist))) { - total_count++; total += fs->slabsize; list_remove(&freelist, fs); /* @@ -3753,6 +3617,9 @@ vmem_fini(vmem_t *heap) dprintf("SPL: %s destroying vmem_vmem_arena\n", __func__); vmem_destroy_internal(vmem_vmem_arena); + dprintf("SPL: %s: freeing initial_default_block\n", __func__); + FREE(initial_default_block, M_TEMP); + dprintf("SPL: arenas removed, now try destroying mutexes... "); dprintf("vmem_xnu_alloc_lock "); @@ -3788,8 +3655,15 @@ vmem_fini(vmem_t *heap) // segkmem_free(fs->vmp, fs->slab, fs->slabsize); FREE(fs, M_TEMP); } - dprintf("SPL: WOULD HAVE released %llu bytes (%llu spans) from" - " arenas\n", total, total_count); + if (total != 0 && total_count != 0) { + dprintf("SPL: %s:%d: WOULD HAVE released %llu bytes" + " (%llu spans) from arenas\n", + __func__, __LINE__, total, total_count); + } else { + dprintf("SPL: %s:%d good," + " did not have to force release any vmem spans", + __func__, __LINE__); + } list_destroy(&freelist); dprintf("SPL: %s: Brief delay for readability...\n", __func__); delay(hz); diff --git a/module/os/windows/zfs/abd_os.c b/module/os/windows/zfs/abd_os.c index bdf16dcaf21d..952e8573666d 100644 --- a/module/os/windows/zfs/abd_os.c +++ b/module/os/windows/zfs/abd_os.c @@ -13,6 +13,7 @@ * Copyright (c) 2014 by Chunwei Chen. All rights reserved. * Copyright (c) 2016 by Delphix. All rights reserved. * Copyright (c) 2020 by Jorgen Lundman. All rights reserved. + * Copyright (c) 2021 by Sean Doran. All rights reserved. */ /* @@ -32,7 +33,9 @@ #include #include #include -#include +#ifdef DEBUG +#include +#endif typedef struct abd_stats { kstat_named_t abdstat_struct_size; @@ -87,11 +90,42 @@ struct { * will cause the machine to panic if you change it and try to access the data * within a scattered ABD. */ -size_t zfs_abd_chunk_size = 4096; -lookasidelist_cache_t *abd_chunk_cache; +#if defined(__arm64__) +/* + * On ARM macOS, PAGE_SIZE is not a runtime constant! So here we have to + * guess at compile time. 
There a balance between fewer kmem_caches, more + * memory use by "tails" of medium-sized ABDs, and more memory use by + * accounting structures if we use 4k versus 16k. + * + * Since the original *subpage* design expected PAGE_SIZE to be constant and + * the pre-subpage ABDs used PAGE_SIZE without requiring it to be a + * compile-time constant, let's use 16k initially and adjust downwards based + * on feedback. + */ +#define ABD_PGSIZE 16384 +#else +#define ABD_PGSIZE PAGE_SIZE +#endif + +const static size_t zfs_abd_chunk_size = ABD_PGSIZE; + +kmem_cache_t *abd_chunk_cache; static kstat_t *abd_ksp; +/* + * Sub-ABD_PGSIZE allocations are segregated into kmem caches. This may be + * inefficient or counterproductive if in future the following conditions are + * not met. + */ +_Static_assert(SPA_MINBLOCKSHIFT == 9, "unexpected SPA_MINSBLOCKSHIFT != 9"); +_Static_assert(ISP2(ABD_PGSIZE), "ABD_PGSIZE unexpectedly non power of 2"); +_Static_assert(ABD_PGSIZE >= 4096, "ABD_PGSIZE unexpectedly smaller than 4096"); +_Static_assert(ABD_PGSIZE <= 16384, + "ABD_PGSIZE unexpectedly larger than 16384"); + +#define SUBPAGE_CACHE_INDICES (ABD_PGSIZE >> SPA_MINBLOCKSHIFT) +kmem_cache_t *abd_subpage_cache[SUBPAGE_CACHE_INDICES] = { NULL }; /* * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose chunks are @@ -105,19 +139,19 @@ static char *abd_zero_buf = NULL; static void abd_free_chunk(void *c) { - lookasidelist_cache_free(abd_chunk_cache, c); + kmem_cache_free(abd_chunk_cache, c); } -static size_t +static inline size_t abd_chunkcnt_for_bytes(size_t size) { return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size); } -static inline size_t +static size_t abd_scatter_chunkcnt(abd_t *abd) { - ASSERT(!abd_is_linear(abd)); + VERIFY(!abd_is_linear(abd)); return (abd_chunkcnt_for_bytes( ABD_SCATTER(abd).abd_offset + abd->abd_size)); } @@ -125,7 +159,7 @@ abd_scatter_chunkcnt(abd_t *abd) boolean_t abd_size_alloc_linear(size_t size) { - return (size <= zfs_abd_chunk_size ? 
B_TRUE : B_FALSE); + return (B_FALSE); } void @@ -137,12 +171,12 @@ abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op) ABDSTAT_BUMP(abdstat_scatter_cnt); ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size); ABDSTAT_INCR(abdstat_scatter_chunk_waste, - n * zfs_abd_chunk_size - abd->abd_size); + n * ABD_SCATTER(abd).abd_chunk_size - abd->abd_size); } else { ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); ABDSTAT_INCR(abdstat_scatter_chunk_waste, - abd->abd_size - n * zfs_abd_chunk_size); + abd->abd_size - n * ABD_SCATTER(abd).abd_chunk_size); } } @@ -169,31 +203,87 @@ abd_verify_scatter(abd_t *abd) VERIFY(!abd_is_linear_page(abd)); VERIFY3U(ABD_SCATTER(abd).abd_offset, <, zfs_abd_chunk_size); + VERIFY3U(ABD_SCATTER(abd).abd_offset, <, + ABD_SCATTER(abd).abd_chunk_size); + VERIFY3U(ABD_SCATTER(abd).abd_chunk_size, >=, + SPA_MINBLOCKSIZE); size_t n = abd_scatter_chunkcnt(abd); + + if (ABD_SCATTER(abd).abd_chunk_size != ABD_PGSIZE) { + VERIFY3U(n, ==, 1); + VERIFY3U(ABD_SCATTER(abd).abd_chunk_size, <, ABD_PGSIZE); + VERIFY3U(abd->abd_size, <=, ABD_SCATTER(abd).abd_chunk_size); + } + for (int i = 0; i < n; i++) { - ASSERT3P( + VERIFY3P( ABD_SCATTER(abd).abd_chunks[i], !=, NULL); } } +static inline int +abd_subpage_cache_index(const size_t size) +{ + const int idx = size >> SPA_MINBLOCKSHIFT; + + if ((size % SPA_MINBLOCKSIZE) == 0) + return (idx - 1); + else + return (idx); +} + +static inline uint_t +abd_subpage_enclosing_size(const int i) +{ + return (SPA_MINBLOCKSIZE * (i + 1)); +} + void abd_alloc_chunks(abd_t *abd, size_t size) { - size_t n = abd_chunkcnt_for_bytes(size); - for (int i = 0; i < n; i++) { - void *c = lookasidelist_cache_alloc(abd_chunk_cache); - ABD_SCATTER(abd).abd_chunks[i] = c; + VERIFY3U(size, >, 0); + if (size <= (zfs_abd_chunk_size - SPA_MINBLOCKSIZE)) { + const int i = abd_subpage_cache_index(size); + VERIFY3S(i, >=, 0); + VERIFY3S(i, <, SUBPAGE_CACHE_INDICES); + const uint_t s = abd_subpage_enclosing_size(i); + VERIFY3U(s, >=, size); + VERIFY3U(s, <, zfs_abd_chunk_size); + void *c = kmem_cache_alloc(abd_subpage_cache[i], KM_SLEEP); + ABD_SCATTER(abd).abd_chunks[0] = c; + ABD_SCATTER(abd).abd_chunk_size = s; + } else { + const size_t n = abd_chunkcnt_for_bytes(size); + + for (int i = 0; i < n; i++) { + void *c = kmem_cache_alloc(abd_chunk_cache, KM_SLEEP); + ABD_SCATTER(abd).abd_chunks[i] = c; + } + ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size; } - ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size; } void abd_free_chunks(abd_t *abd) { - size_t n = abd_scatter_chunkcnt(abd); - for (int i = 0; i < n; i++) { - abd_free_chunk(ABD_SCATTER(abd).abd_chunks[i]); + const uint_t abd_cs = ABD_SCATTER(abd).abd_chunk_size; + + if (abd_cs <= (zfs_abd_chunk_size - SPA_MINBLOCKSIZE)) { + VERIFY3U(abd->abd_size, <, zfs_abd_chunk_size); + VERIFY0(P2PHASE(abd_cs, SPA_MINBLOCKSIZE)); + + const int idx = abd_subpage_cache_index(abd_cs); + VERIFY3S(idx, >=, 0); + VERIFY3S(idx, <, SUBPAGE_CACHE_INDICES); + + kmem_cache_free(abd_subpage_cache[idx], + ABD_SCATTER(abd).abd_chunks[0]); + } else { + const size_t n = abd_scatter_chunkcnt(abd); + for (int i = 0; i < n; i++) { + abd_free_chunk(ABD_SCATTER(abd).abd_chunks[i]); + } } } @@ -236,7 +326,7 @@ static void abd_alloc_zero_scatter(void) { size_t n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE); - abd_zero_buf = lookasidelist_cache_alloc(abd_chunk_cache); + abd_zero_buf = kmem_cache_alloc(abd_chunk_cache, KM_SLEEP); memset(abd_zero_buf, 0, zfs_abd_chunk_size); abd_zero_scatter = 
abd_alloc_struct(SPA_MAXBLOCKSIZE); @@ -264,28 +354,142 @@ abd_free_zero_scatter(void) abd_free_struct(abd_zero_scatter); abd_zero_scatter = NULL; - lookasidelist_cache_free(abd_chunk_cache, abd_zero_buf); + kmem_cache_free(abd_chunk_cache, abd_zero_buf); +} + +static int +abd_kstats_update(kstat_t *ksp, int rw) +{ + abd_stats_t *as = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + as->abdstat_struct_size.value.ui64 = + wmsum_value(&abd_sums.abdstat_struct_size); + as->abdstat_scatter_cnt.value.ui64 = + wmsum_value(&abd_sums.abdstat_scatter_cnt); + as->abdstat_scatter_data_size.value.ui64 = + wmsum_value(&abd_sums.abdstat_scatter_data_size); + as->abdstat_scatter_chunk_waste.value.ui64 = + wmsum_value(&abd_sums.abdstat_scatter_chunk_waste); + as->abdstat_linear_cnt.value.ui64 = + wmsum_value(&abd_sums.abdstat_linear_cnt); + as->abdstat_linear_data_size.value.ui64 = + wmsum_value(&abd_sums.abdstat_linear_data_size); + return (0); } void abd_init(void) { - abd_chunk_cache = lookasidelist_cache_create("abd_chunk", - zfs_abd_chunk_size); + /* check if we guessed ABD_PGSIZE correctly */ + ASSERT3U(ABD_PGSIZE, ==, PAGE_SIZE); + +#ifdef DEBUG + /* + * KMF_BUFTAG | KMF_LITE on the abd kmem_caches causes them to waste + * up to 50% of their memory for redzone. Even in DEBUG builds this + * therefore should be KMC_NOTOUCH unless there are concerns about + * overruns, UAFs, etc involving abd chunks or subpage chunks. + * + * Additionally these KMF_ + * flags require the definitions from + */ + + /* + * DEBUGGING: do this + * const int cflags = KMF_BUFTAG | KMF_LITE; + * or + * const int cflags = KMC_ARENA_SLAB; + */ + + int cflags = KMC_ARENA_SLAB; +#else + int cflags = KMC_ARENA_SLAB; +#endif + +#ifdef _KERNEL +/* This must all match spl-seg_kmem.c : segkmem_abd_init() */ +#define SMALL_RAM_MACHINE (4ULL * 1024ULL * 1024ULL * 1024ULL) + + extern uint64_t total_memory; + + if (total_memory < SMALL_RAM_MACHINE) { + cflags = KMC_NOTOUCH; + } +#endif + + abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, + ABD_PGSIZE, + NULL, NULL, NULL, NULL, abd_arena, cflags); + + wmsum_init(&abd_sums.abdstat_struct_size, 0); + wmsum_init(&abd_sums.abdstat_scatter_cnt, 0); + wmsum_init(&abd_sums.abdstat_scatter_data_size, 0); + wmsum_init(&abd_sums.abdstat_scatter_chunk_waste, 0); + wmsum_init(&abd_sums.abdstat_linear_cnt, 0); + wmsum_init(&abd_sums.abdstat_linear_data_size, 0); abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (abd_ksp != NULL) { abd_ksp->ks_data = &abd_stats; + abd_ksp->ks_update = abd_kstats_update; kstat_install(abd_ksp); } abd_alloc_zero_scatter(); + + /* + * Check at compile time that SPA_MINBLOCKSIZE is 512, because we want + * to build sub-page-size linear ABD kmem caches at multiples of + * SPA_MINBLOCKSIZE. If SPA_MINBLOCKSIZE ever changes, a different + * layout should be calculated at runtime. + * + * See also the assertions above the definition of abd_subpbage_cache. 
+ */ + + _Static_assert(SPA_MINBLOCKSIZE == 512, + "unexpected SPA_MINBLOCKSIZE != 512"); + + const int step_size = SPA_MINBLOCKSIZE; + for (int bytes = step_size; bytes < ABD_PGSIZE; bytes += step_size) { + char name[36]; + + (void) snprintf(name, sizeof (name), + "abd_subpage_%lu", (ulong_t)bytes); + + const int index = (bytes >> SPA_MINBLOCKSHIFT) - 1; + VERIFY3S(index, >=, 0); + VERIFY3S(index, <, SUBPAGE_CACHE_INDICES); + +#ifdef DEBUG + int csubflags = KMF_LITE; +#else + int csubflags = 0; +#endif +#ifdef _KERNEL + if (total_memory < SMALL_RAM_MACHINE) + csubflags = cflags; +#endif + abd_subpage_cache[index] = + kmem_cache_create(name, bytes, sizeof (void *), + NULL, NULL, NULL, NULL, abd_subpage_arena, csubflags); + + VERIFY3P(abd_subpage_cache[index], !=, NULL); + } } void abd_fini(void) { + const int step_size = SPA_MINBLOCKSIZE; + for (int bytes = step_size; bytes < ABD_PGSIZE; bytes += step_size) { + const int index = (bytes >> SPA_MINBLOCKSHIFT) - 1; + kmem_cache_destroy(abd_subpage_cache[index]); + abd_subpage_cache[index] = NULL; + } + abd_free_zero_scatter(); if (abd_ksp != NULL) { @@ -293,7 +497,14 @@ abd_fini(void) abd_ksp = NULL; } - lookasidelist_cache_destroy(abd_chunk_cache); + wmsum_fini(&abd_sums.abdstat_struct_size); + wmsum_fini(&abd_sums.abdstat_scatter_cnt); + wmsum_fini(&abd_sums.abdstat_scatter_data_size); + wmsum_fini(&abd_sums.abdstat_scatter_chunk_waste); + wmsum_fini(&abd_sums.abdstat_linear_cnt); + wmsum_fini(&abd_sums.abdstat_linear_data_size); + + kmem_cache_destroy(abd_chunk_cache); abd_chunk_cache = NULL; } @@ -323,27 +534,64 @@ abd_alloc_for_io(size_t size, boolean_t is_metadata) return (abd_alloc_linear(size, is_metadata)); } + +/* + * return an ABD structure that peers into source ABD sabd. The returned ABD + * may be new, or the one supplied as abd. abd and sabd must point to one or + * more zfs_abd_chunk_size (ABD_PGSIZE) chunks, or point to one and exactly one + * smaller chunk. + * + * The [off, off+size] range must be found within (and thus + * fit within) the source ABD. + */ + abd_t * abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off, size_t size) { abd_verify(sabd); VERIFY3U(off, <=, sabd->abd_size); - size_t new_offset = ABD_SCATTER(sabd).abd_offset + off; + const uint_t sabd_chunksz = ABD_SCATTER(sabd).abd_chunk_size; + + const size_t new_offset = ABD_SCATTER(sabd).abd_offset + off; + + /* subpage ABD range checking */ + if (sabd_chunksz != zfs_abd_chunk_size) { + /* off+size must fit in 1 chunk */ + VERIFY3U(off + size, <=, sabd_chunksz); + /* new_offset must be in bounds of 1 chunk */ + VERIFY3U(new_offset, <=, sabd_chunksz); + /* new_offset + size must be in bounds of 1 chunk */ + VERIFY3U(new_offset + size, <=, sabd_chunksz); + } /* * chunkcnt is abd_chunkcnt_for_bytes(size), which rounds * up to the nearest chunk, but we also must take care * of the offset *in the leading chunk* */ - size_t chunkcnt = abd_chunkcnt_for_bytes( - (new_offset % zfs_abd_chunk_size) + size); + const size_t chunkcnt = (sabd_chunksz != zfs_abd_chunk_size) + ? 
1 + : abd_chunkcnt_for_bytes((new_offset % sabd_chunksz) + size); + /* sanity checks on chunkcnt */ VERIFY3U(chunkcnt, <=, abd_scatter_chunkcnt(sabd)); + VERIFY3U(chunkcnt, >, 0); + + /* non-subpage sanity checking */ + if (chunkcnt > 1) { + /* compare with legacy calculation of chunkcnt */ + VERIFY3U(chunkcnt, ==, abd_chunkcnt_for_bytes( + P2PHASE(new_offset, zfs_abd_chunk_size) + size)); + /* EITHER subpage chunk (singular) or std chunks */ + VERIFY3U(sabd_chunksz, ==, zfs_abd_chunk_size); + } /* - * If an abd struct is provided, it is only the minimum size. If we - * need additional chunks, we need to allocate a new struct. + * If an abd struct is provided, it is only the minimum size (and + * almost certainly provided as an abd_t embedded in a larger + * structure). If we need additional chunks, we need to allocate a + * new struct. */ if (abd != NULL && offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]) > @@ -352,7 +600,7 @@ abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off, size_t size) } if (abd == NULL) - abd = abd_alloc_struct(chunkcnt * zfs_abd_chunk_size); + abd = abd_alloc_struct(chunkcnt * sabd_chunksz); /* * Even if this buf is filesystem metadata, we only track that @@ -360,13 +608,24 @@ abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off, size_t size) * this case. Therefore, we don't ever use ABD_FLAG_META here. */ - ABD_SCATTER(abd).abd_offset = new_offset % zfs_abd_chunk_size; - ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size; + /* update offset, and sanity check it */ + ABD_SCATTER(abd).abd_offset = new_offset % sabd_chunksz; + + VERIFY3U(ABD_SCATTER(abd).abd_offset, <, sabd_chunksz); + VERIFY3U(ABD_SCATTER(abd).abd_offset + size, <=, + chunkcnt * sabd_chunksz); + + ABD_SCATTER(abd).abd_chunk_size = sabd_chunksz; + + if (chunkcnt > 1) { + VERIFY3U(ABD_SCATTER(sabd).abd_chunk_size, ==, + zfs_abd_chunk_size); + } /* Copy the scatterlist starting at the correct offset */ (void) memcpy(&ABD_SCATTER(abd).abd_chunks, &ABD_SCATTER(sabd).abd_chunks[new_offset / - zfs_abd_chunk_size], + sabd_chunksz], chunkcnt * sizeof (void *)); return (abd); @@ -377,15 +636,16 @@ abd_iter_scatter_chunk_offset(struct abd_iter *aiter) { ASSERT(!abd_is_linear(aiter->iter_abd)); return ((ABD_SCATTER(aiter->iter_abd).abd_offset + - aiter->iter_pos) % zfs_abd_chunk_size); + aiter->iter_pos) % + ABD_SCATTER(aiter->iter_abd).abd_chunk_size); } static inline size_t abd_iter_scatter_chunk_index(struct abd_iter *aiter) { ASSERT(!abd_is_linear(aiter->iter_abd)); - return ((ABD_SCATTER(aiter->iter_abd).abd_offset + - aiter->iter_pos) / zfs_abd_chunk_size); + return ((ABD_SCATTER(aiter->iter_abd).abd_offset + aiter->iter_pos) + / ABD_SCATTER(aiter->iter_abd).abd_chunk_size); } /* @@ -443,9 +703,30 @@ abd_iter_map(struct abd_iter *aiter) ASSERT3P(aiter->iter_mapaddr, ==, NULL); ASSERT0(aiter->iter_mapsize); +#if 0 /* Panic if someone has changed zfs_abd_chunk_size */ + IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size == ABD_SCATTER(aiter->iter_abd).abd_chunk_size); +#else + /* + * If scattered, VERIFY that we are using ABD_PGSIZE chunks, or we have + * one and only one chunk of less than ABD_PGSIZE. 
+ */ + + if (!abd_is_linear(aiter->iter_abd)) { + if (ABD_SCATTER(aiter->iter_abd).abd_chunk_size != + zfs_abd_chunk_size) { + VERIFY3U( + ABD_SCATTER(aiter->iter_abd).abd_chunk_size, + <, zfs_abd_chunk_size); + VERIFY3U(aiter->iter_abd->abd_size, + <, zfs_abd_chunk_size); + VERIFY3U(aiter->iter_abd->abd_size, + <=, ABD_SCATTER(aiter->iter_abd).abd_chunk_size); + } + } +#endif /* There's nothing left to iterate over, so do nothing */ if (abd_iter_at_end(aiter)) @@ -457,8 +738,12 @@ abd_iter_map(struct abd_iter *aiter) paddr = ABD_LINEAR_BUF(aiter->iter_abd); } else { size_t index = abd_iter_scatter_chunk_index(aiter); + IMPLY(ABD_SCATTER(aiter->iter_abd).abd_chunk_size != ABD_PGSIZE, + index == 0); offset = abd_iter_scatter_chunk_offset(aiter); - aiter->iter_mapsize = MIN(zfs_abd_chunk_size - offset, + aiter->iter_mapsize = MIN( + ABD_SCATTER(aiter->iter_abd).abd_chunk_size + - offset, aiter->iter_abd->abd_size - aiter->iter_pos); paddr = ABD_SCATTER(aiter->iter_abd).abd_chunks[index]; } @@ -472,12 +757,10 @@ abd_iter_map(struct abd_iter *aiter) void abd_iter_unmap(struct abd_iter *aiter) { - /* There's nothing left to unmap, so do nothing */ - if (abd_iter_at_end(aiter)) - return; - - ASSERT3P(aiter->iter_mapaddr, !=, NULL); - ASSERT3U(aiter->iter_mapsize, >, 0); + if (!abd_iter_at_end(aiter)) { + ASSERT3P(aiter->iter_mapaddr, !=, NULL); + ASSERT3U(aiter->iter_mapsize, >, 0); + } aiter->iter_mapaddr = NULL; aiter->iter_mapsize = 0; @@ -486,5 +769,20 @@ abd_iter_unmap(struct abd_iter *aiter) void abd_cache_reap_now(void) { - // do nothing + /* + * This function is called by arc_kmem_reap_soon(), which also invokes + * kmem_cache_reap_now() on several other kmem caches. + * + * kmem_cache_reap_now() now operates on all kmem caches at each + * invocation (ignoring its kmem_cache_t argument except for an ASSERT + * in DEBUG builds) by invoking kmem_reap(). Previously + * kmem_cache_reap_now() would clearing the caches magazine working + * set and starting a reap immediately and without regard to the + * kmem_reaping compare-and-swap flag. + * + * Previously in this function we would call kmem_cache_reap_now() for + * each of the abd_chunk and subpage kmem caches. Now, since this + * function is called after several kmem_cache_reap_now(), it + * can be a noop. + */ } diff --git a/module/os/windows/zfs/arc_os.c b/module/os/windows/zfs/arc_os.c index 65a2ab075f42..e586b87b5581 100644 --- a/module/os/windows/zfs/arc_os.c +++ b/module/os/windows/zfs/arc_os.c @@ -826,3 +826,11 @@ void arc_unregister_hotplug(void) { } + +void +spl_set_arc_no_grow(int i) +{ + arc_no_grow = i; + if (i == B_TRUE) + membar_producer(); /* make it visible to other threads */ +}