diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 9b50a4a3d96e..f3d4316f6f67 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -14,7 +14,7 @@ Please check our issue tracker before opening a new feature request. Filling out the following template will help other contributors better understand your proposed feature. --> -### Describe the feature would like to see added to OpenZFS +### Describe the feature you would like to see added to OpenZFS - - ### Motivation and Context diff --git a/.github/codeql-cpp.yml b/.github/codeql-cpp.yml index 88b8c6086025..d99cdb559244 100644 --- a/.github/codeql-cpp.yml +++ b/.github/codeql-cpp.yml @@ -2,3 +2,4 @@ name: "Custom CodeQL Analysis" queries: - uses: ./.github/codeql/custom-queries/cpp/deprecatedFunctionUsage.ql + - uses: ./.github/codeql/custom-queries/cpp/dslDatasetHoldReleMismatch.ql diff --git a/.github/codeql/custom-queries/cpp/dslDatasetHoldReleMismatch.ql b/.github/codeql/custom-queries/cpp/dslDatasetHoldReleMismatch.ql new file mode 100644 index 000000000000..fb5dae35092f --- /dev/null +++ b/.github/codeql/custom-queries/cpp/dslDatasetHoldReleMismatch.ql @@ -0,0 +1,34 @@ +/** + * @name Detect mismatched dsl_dataset_hold/_rele pairs + * @description Flags instances of issue #12014 where + * - a dataset held with dsl_dataset_hold_obj() ends up in dsl_dataset_rele_flags(), or + * - a dataset held with dsl_dataset_hold_obj_flags() ends up in dsl_dataset_rele(). 
+ * @kind problem + * @severity error + * @tags correctness + * @id cpp/dslDatasetHoldReleMismatch + */ + +import cpp + +from Variable ds, Call holdCall, Call releCall, string message +where + ds.getType().toString() = "dsl_dataset_t *" and + holdCall.getASuccessor*() = releCall and + ( + (holdCall.getTarget().getName() = "dsl_dataset_hold_obj_flags" and + holdCall.getArgument(4).(AddressOfExpr).getOperand().(VariableAccess).getTarget() = ds and + releCall.getTarget().getName() = "dsl_dataset_rele" and + releCall.getArgument(0).(VariableAccess).getTarget() = ds and + message = "Held with dsl_dataset_hold_obj_flags but released with dsl_dataset_rele") + or + (holdCall.getTarget().getName() = "dsl_dataset_hold_obj" and + holdCall.getArgument(3).(AddressOfExpr).getOperand().(VariableAccess).getTarget() = ds and + releCall.getTarget().getName() = "dsl_dataset_rele_flags" and + releCall.getArgument(0).(VariableAccess).getTarget() = ds and + message = "Held with dsl_dataset_hold_obj but released with dsl_dataset_rele_flags") + ) +select releCall, + "Mismatched release: held with $@ but released with " + releCall.getTarget().getName() + " for dataset $@", + holdCall, holdCall.getTarget().getName(), + ds, ds.toString() diff --git a/.github/workflows/scripts/generate-ci-type.py b/.github/workflows/scripts/generate-ci-type.py index b49255e8381d..08021aabcb61 100755 --- a/.github/workflows/scripts/generate-ci-type.py +++ b/.github/workflows/scripts/generate-ci-type.py @@ -65,7 +65,7 @@ def output_type(type, reason): # check last (HEAD) commit message last_commit_message_raw = subprocess.run([ - 'git', 'show', '-s', '--format=%B', 'HEAD' + 'git', 'show', '-s', '--format=%B', head ], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) for line in last_commit_message_raw.stdout.decode().splitlines(): diff --git a/.github/workflows/scripts/qemu-1-setup.sh b/.github/workflows/scripts/qemu-1-setup.sh index de29ad1f57b6..0278264d9279 100755 --- 
a/.github/workflows/scripts/qemu-1-setup.sh +++ b/.github/workflows/scripts/qemu-1-setup.sh @@ -6,6 +6,13 @@ set -eu +# We've been seeing this script take over 15min to run. This may or +# may not be normal. Just to get a little more insight, print out +# a message to stdout with the top running process, and do this every +# 30 seconds. We can delete this watchdog later once we get a better +# handle on what the timeout value should be. +(while [ 1 ] ; do sleep 30 && echo "[watchdog: $(ps -eo cmd --sort=-pcpu | head -n 2 | tail -n 1)}')]"; done) & + # install needed packages export DEBIAN_FRONTEND="noninteractive" sudo apt-get -y update @@ -65,3 +72,6 @@ sudo zpool create -f -o ashift=12 zpool $SSD1 $SSD2 -O relatime=off \ for i in /sys/block/s*/queue/scheduler; do echo "none" | sudo tee $i done + +# Kill off our watchdog +kill $(jobs -p) diff --git a/.github/workflows/scripts/qemu-2-start.sh b/.github/workflows/scripts/qemu-2-start.sh index 28da6700e541..422b3e9df388 100755 --- a/.github/workflows/scripts/qemu-2-start.sh +++ b/.github/workflows/scripts/qemu-2-start.sh @@ -12,10 +12,10 @@ OS="$1" # OS variant (virt-install --os-variant list) OSv=$OS -# compressed with .zst extension -REPO="https://github.com/mcmilk/openzfs-freebsd-images" -FREEBSD="$REPO/releases/download/v2025-04-13" -URLzs="" +# FreeBSD urls's +FREEBSD_REL="https://download.freebsd.org/releases/CI-IMAGES" +FREEBSD_SNAP="https://download.freebsd.org/snapshots/CI-IMAGES" +URLxz="" # Ubuntu mirrors UBMIRROR="https://cloud-images.ubuntu.com" @@ -25,6 +25,10 @@ UBMIRROR="https://cloud-images.ubuntu.com" # default nic model for vm's NIC="virtio" +# additional options for virt-install +OPTS[0]="" +OPTS[1]="" + case "$OS" in almalinux8) OSNAME="AlmaLinux 8" @@ -61,6 +65,14 @@ case "$OS" in OSNAME="Debian 12" URL="https://cloud.debian.org/images/cloud/bookworm/latest/debian-12-generic-amd64.qcow2" ;; + debian13) + OSNAME="Debian 13" + # TODO: Overwrite OSv to debian13 for virt-install until it's added to 
osinfo + OSv="debian12" + URL="https://cloud.debian.org/images/cloud/trixie/latest/debian-13-generic-amd64.qcow2" + OPTS[0]="--boot" + OPTS[1]="uefi=on" + ;; fedora41) OSNAME="Fedora 41" OSv="fedora-unknown" @@ -71,50 +83,56 @@ case "$OS" in OSv="fedora-unknown" URL="https://download.fedoraproject.org/pub/fedora/linux/releases/42/Cloud/x86_64/images/Fedora-Cloud-Base-Generic-42-1.1.x86_64.qcow2" ;; - freebsd13-4r) - OSNAME="FreeBSD 13.4-RELEASE" - OSv="freebsd13.0" - URLzs="$FREEBSD/amd64-freebsd-13.4-RELEASE.qcow2.zst" - BASH="/usr/local/bin/bash" - NIC="rtl8139" - ;; freebsd13-5r) - OSNAME="FreeBSD 13.5-RELEASE" + FreeBSD="13.5-RELEASE" + OSNAME="FreeBSD $FreeBSD" OSv="freebsd13.0" - URLzs="$FREEBSD/amd64-freebsd-13.5-RELEASE.qcow2.zst" - BASH="/usr/local/bin/bash" + URLxz="$FREEBSD_REL/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI.raw.xz" + KSRC="$FREEBSD_REL/../amd64/$FreeBSD/src.txz" NIC="rtl8139" ;; - freebsd14-1r) - OSNAME="FreeBSD 14.1-RELEASE" + freebsd14-2r) + FreeBSD="14.2-RELEASE" + OSNAME="FreeBSD $FreeBSD" OSv="freebsd14.0" - URLzs="$FREEBSD/amd64-freebsd-14.1-RELEASE.qcow2.zst" - BASH="/usr/local/bin/bash" + KSRC="$FREEBSD_REL/../amd64/$FreeBSD/src.txz" + URLxz="$FREEBSD_REL/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI.raw.xz" ;; - freebsd14-2r) - OSNAME="FreeBSD 14.2-RELEASE" + freebsd14-3r) + FreeBSD="14.3-RELEASE" + OSNAME="FreeBSD $FreeBSD" OSv="freebsd14.0" - URLzs="$FREEBSD/amd64-freebsd-14.2-RELEASE.qcow2.zst" - BASH="/usr/local/bin/bash" + URLxz="$FREEBSD_REL/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI.raw.xz" + KSRC="$FREEBSD_REL/../amd64/$FreeBSD/src.txz" ;; freebsd13-5s) - OSNAME="FreeBSD 13.5-STABLE" + FreeBSD="13.5-STABLE" + OSNAME="FreeBSD $FreeBSD" OSv="freebsd13.0" - URLzs="$FREEBSD/amd64-freebsd-13.5-STABLE.qcow2.zst" - BASH="/usr/local/bin/bash" + URLxz="$FREEBSD_SNAP/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI.raw.xz" + KSRC="$FREEBSD_SNAP/../amd64/$FreeBSD/src.txz" NIC="rtl8139" ;; - 
freebsd14-2s) - OSNAME="FreeBSD 14.2-STABLE" + freebsd14-3s) + FreeBSD="14.3-STABLE" + OSNAME="FreeBSD $FreeBSD" OSv="freebsd14.0" - URLzs="$FREEBSD/amd64-freebsd-14.2-STABLE.qcow2.zst" - BASH="/usr/local/bin/bash" + URLxz="$FREEBSD_SNAP/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI-ufs.raw.xz" + KSRC="$FREEBSD_SNAP/../amd64/$FreeBSD/src.txz" ;; freebsd15-0c) - OSNAME="FreeBSD 15.0-CURRENT" + FreeBSD="15.0-ALPHA4" + OSNAME="FreeBSD $FreeBSD" + OSv="freebsd14.0" + URLxz="$FREEBSD_SNAP/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI-ufs.raw.xz" + KSRC="$FREEBSD_SNAP/../amd64/$FreeBSD/src.txz" + ;; + freebsd16-0c) + FreeBSD="16.0-CURRENT" + OSNAME="FreeBSD $FreeBSD" OSv="freebsd14.0" - URLzs="$FREEBSD/amd64-freebsd-15.0-CURRENT.qcow2.zst" - BASH="/usr/local/bin/bash" + URLxz="$FREEBSD_SNAP/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI-ufs.raw.xz" + KSRC="$FREEBSD_SNAP/../amd64/$FreeBSD/src.txz" ;; tumbleweed) OSNAME="openSUSE Tumbleweed" @@ -168,31 +186,37 @@ echo "CPU=\"$CPU\"" >> $ENV sudo mkdir -p "/mnt/tests" sudo chown -R $(whoami) /mnt/tests +DISK="/dev/zvol/zpool/openzfs" +sudo zfs create -ps -b 64k -V 80g zpool/openzfs +while true; do test -b $DISK && break; sleep 1; done + # we are downloading via axel, curl and wget are mostly slower and # require more return value checking -IMG="/mnt/tests/cloudimg.qcow2" -if [ ! -z "$URLzs" ]; then - echo "Loading image $URLzs ..." - time axel -q -o "$IMG.zst" "$URLzs" - zstd -q -d --rm "$IMG.zst" +IMG="/mnt/tests/cloud-image" +if [ ! -z "$URLxz" ]; then + echo "Loading $URLxz ..." + time axel -q -o "$IMG" "$URLxz" + echo "Loading $KSRC ..." + time axel -q -o ~/src.txz $KSRC else - echo "Loading image $URL ..." + echo "Loading $URL ..." time axel -q -o "$IMG" "$URL" fi -DISK="/dev/zvol/zpool/openzfs" -FORMAT="raw" -sudo zfs create -ps -b 64k -V 80g zpool/openzfs -while true; do test -b $DISK && break; sleep 1; done echo "Importing VM image to zvol..." 
-sudo qemu-img dd -f qcow2 -O raw if=$IMG of=$DISK bs=4M +if [ ! -z "$URLxz" ]; then + xzcat -T0 $IMG | sudo dd of=$DISK bs=4M +else + sudo qemu-img dd -f qcow2 -O raw if=$IMG of=$DISK bs=4M +fi rm -f $IMG PUBKEY=$(cat ~/.ssh/id_ed25519.pub) -cat < /tmp/user-data +if [ ${OS:0:7} != "freebsd" ]; then + cat < /tmp/user-data #cloud-config -fqdn: $OS +hostname: $OS users: - name: root @@ -208,6 +232,19 @@ growpart: devices: ['/'] ignore_growroot_disabled: false EOF +else + cat < /tmp/user-data +#cloud-config + +hostname: $OS + +# minimized config without sudo for nuageinit of FreeBSD +growpart: + mode: auto + devices: ['/'] + ignore_growroot_disabled: false +EOF +fi sudo virsh net-update default add ip-dhcp-host \ "" --live --config @@ -223,15 +260,8 @@ sudo virt-install \ --graphics none \ --network bridge=virbr0,model=$NIC,mac='52:54:00:83:79:00' \ --cloud-init user-data=/tmp/user-data \ - --disk $DISK,bus=virtio,cache=none,format=$FORMAT,driver.discard=unmap \ - --import --noautoconsole >/dev/null - -# enable KSM on Linux -if [ ${OS:0:7} != "freebsd" ]; then - sudo virsh dommemstat --domain "openzfs" --period 5 - sudo virsh node-memory-tune 100 50 1 - echo 1 | sudo tee /sys/kernel/mm/ksm/run > /dev/null -fi + --disk $DISK,bus=virtio,cache=none,format=raw,driver.discard=unmap \ + --import --noautoconsole ${OPTS[0]} ${OPTS[1]} >/dev/null # Give the VMs hostnames so we don't have to refer to them with # hardcoded IP addresses. 
@@ -252,3 +282,29 @@ StrictHostKeyChecking no # small timeout, used in while loops later ConnectTimeout 1 EOF + +if [ ${OS:0:7} != "freebsd" ]; then + # enable KSM on Linux + sudo virsh dommemstat --domain "openzfs" --period 5 + sudo virsh node-memory-tune 100 50 1 + echo 1 | sudo tee /sys/kernel/mm/ksm/run > /dev/null +else + # on FreeBSD we need some more init stuff, because of nuageinit + BASH="/usr/local/bin/bash" + while pidof /usr/bin/qemu-system-x86_64 >/dev/null; do + ssh 2>/dev/null root@vm0 "uname -a" && break + done + ssh root@vm0 "env IGNORE_OSVERSION=yes pkg install -y bash ca_root_nss git qemu-guest-agent python3 py311-cloud-init" + ssh root@vm0 "chsh -s $BASH root" + ssh root@vm0 'sysrc qemu_guest_agent_enable="YES"' + ssh root@vm0 'sysrc cloudinit_enable="YES"' + ssh root@vm0 "pw add user zfs -w no -s $BASH" + ssh root@vm0 'mkdir -p ~zfs/.ssh' + ssh root@vm0 'echo "zfs ALL=(ALL:ALL) NOPASSWD: ALL" >> /usr/local/etc/sudoers' + ssh root@vm0 'echo "PubkeyAuthentication yes" >> /etc/ssh/sshd_config' + scp ~/.ssh/id_ed25519.pub "root@vm0:~zfs/.ssh/authorized_keys" + ssh root@vm0 'chown -R zfs ~zfs' + ssh root@vm0 'service sshd restart' + scp ~/src.txz "root@vm0:/tmp/src.txz" + ssh root@vm0 'tar -C / -zxf /tmp/src.txz' +fi diff --git a/.github/workflows/scripts/qemu-3-deps-vm.sh b/.github/workflows/scripts/qemu-3-deps-vm.sh index a581b13c2f58..f67bb2f68e94 100755 --- a/.github/workflows/scripts/qemu-3-deps-vm.sh +++ b/.github/workflows/scripts/qemu-3-deps-vm.sh @@ -20,7 +20,7 @@ function archlinux() { sudo pacman -Sy --noconfirm base-devel bc cpio cryptsetup dhclient dkms \ fakeroot fio gdb inetutils jq less linux linux-headers lsscsi nfs-utils \ parted pax perf python-packaging python-setuptools qemu-guest-agent ksh \ - samba sysstat rng-tools rsync wget xxhash + samba strace sysstat rng-tools rsync wget xxhash echo "##[endgroup]" } @@ -28,6 +28,7 @@ function debian() { export DEBIAN_FRONTEND="noninteractive" echo "##[group]Running apt-get 
update+upgrade" + sudo sed -i '/[[:alpha:]]-backports/d' /etc/apt/sources.list sudo apt-get update -y sudo apt-get upgrade -y echo "##[endgroup]" @@ -40,9 +41,10 @@ function debian() { libelf-dev libffi-dev libmount-dev libpam0g-dev libselinux-dev libssl-dev \ libtool libtool-bin libudev-dev libunwind-dev linux-headers-$(uname -r) \ lsscsi nfs-kernel-server pamtester parted python3 python3-all-dev \ - python3-cffi python3-dev python3-distlib python3-packaging \ + python3-cffi python3-dev python3-distlib python3-packaging libtirpc-dev \ python3-setuptools python3-sphinx qemu-guest-agent rng-tools rpm2cpio \ - rsync samba sysstat uuid-dev watchdog wget xfslibs-dev xxhash zlib1g-dev + rsync samba strace sysstat uuid-dev watchdog wget xfslibs-dev xxhash \ + zlib1g-dev echo "##[endgroup]" } @@ -51,7 +53,7 @@ function freebsd() { echo "##[group]Install Development Tools" sudo pkg install -y autoconf automake autotools base64 checkbashisms fio \ - gdb gettext gettext-runtime git gmake gsed jq ksh93 lcov libtool lscpu \ + gdb gettext gettext-runtime git gmake gsed jq ksh lcov libtool lscpu \ pkgconf python python3 pamtester pamtester qemu-guest-agent rsync xxhash sudo pkg install -xy \ '^samba4[[:digit:]]+$' \ @@ -86,8 +88,8 @@ function rhel() { libuuid-devel lsscsi mdadm nfs-utils openssl-devel pam-devel pamtester \ parted perf python3 python3-cffi python3-devel python3-packaging \ kernel-devel python3-setuptools qemu-guest-agent rng-tools rpcgen \ - rpm-build rsync samba sysstat systemd watchdog wget xfsprogs-devel xxhash \ - zlib-devel + rpm-build rsync samba strace sysstat systemd watchdog wget xfsprogs-devel \ + xxhash zlib-devel echo "##[endgroup]" } @@ -103,7 +105,7 @@ function install_fedora_experimental_kernel { our_version="$1" sudo dnf -y copr enable @kernel-vanilla/stable sudo dnf -y copr enable @kernel-vanilla/mainline - all="$(sudo dnf list --showduplicates kernel-*)" + all="$(sudo dnf list --showduplicates kernel-* python3-perf* perf* bpftool*)" echo 
"Available versions:" echo "$all" diff --git a/.github/workflows/scripts/qemu-4-build-vm.sh b/.github/workflows/scripts/qemu-4-build-vm.sh index 17e976ebcc39..2807d9e77127 100755 --- a/.github/workflows/scripts/qemu-4-build-vm.sh +++ b/.github/workflows/scripts/qemu-4-build-vm.sh @@ -5,12 +5,13 @@ # # Usage: # -# qemu-4-build-vm.sh OS [--enable-debug][--dkms][--poweroff] -# [--release][--repo][--tarball] +# qemu-4-build-vm.sh OS [--enable-debug][--dkms][--patch-level NUM] +# [--poweroff][--release][--repo][--tarball] # # OS: OS name like 'fedora41' # --enable-debug: Build RPMs with '--enable-debug' (for testing) # --dkms: Build DKMS RPMs as well +# --patch-level NUM: Use a custom patch level number for packages. # --poweroff: Power-off the VM after building # --release Build zfs-release*.rpm as well # --repo After building everything, copy RPMs into /tmp/repo @@ -21,6 +22,7 @@ ENABLE_DEBUG="" DKMS="" +PATCH_LEVEL="" POWEROFF="" RELEASE="" REPO="" @@ -35,6 +37,11 @@ while [[ $# -gt 0 ]]; do DKMS=1 shift ;; + --patch-level) + PATCH_LEVEL=$2 + shift + shift + ;; --poweroff) POWEROFF=1 shift @@ -215,6 +222,10 @@ function rpm_build_and_install() { run ./autogen.sh echo "##[endgroup]" + if [ -n "$PATCH_LEVEL" ] ; then + sed -i -E 's/(Release:\s+)1/\1'$PATCH_LEVEL'/g' META + fi + echo "##[group]Configure" run ./configure --enable-debuginfo $extra echo "##[endgroup]" @@ -328,7 +339,13 @@ fi # almalinux9.5 # fedora42 source /etc/os-release -sudo hostname "$ID$VERSION_ID" + if which hostnamectl &> /dev/null ; then + # Fedora 42+ use hostnamectl + sudo hostnamectl set-hostname "$ID$VERSION_ID" + sudo hostnamectl set-hostname --pretty "$ID$VERSION_ID" +else + sudo hostname "$ID$VERSION_ID" +fi # save some sysinfo uname -a > /var/tmp/uname.txt diff --git a/.github/workflows/scripts/qemu-5-setup.sh b/.github/workflows/scripts/qemu-5-setup.sh index 6bf10024a1a6..4869c1003e48 100755 --- a/.github/workflows/scripts/qemu-5-setup.sh +++ b/.github/workflows/scripts/qemu-5-setup.sh @@ 
-12,16 +12,26 @@ source /var/tmp/env.txt # wait for poweroff to succeed PID=$(pidof /usr/bin/qemu-system-x86_64) tail --pid=$PID -f /dev/null -sudo virsh undefine openzfs +sudo virsh undefine --nvram openzfs # cpu pinning CPUSET=("0,1" "2,3") +# additional options for virt-install +OPTS[0]="" +OPTS[1]="" + case "$OS" in freebsd*) # FreeBSD needs only 6GiB RAM=6 ;; + debian13) + RAM=8 + # Boot Debian 13 with uefi=on and secureboot=off (ZFS Kernel Module not signed) + OPTS[0]="--boot" + OPTS[1]="firmware=efi,firmware.feature0.name=secure-boot,firmware.feature0.enabled=no" + ;; *) # Linux needs more memory, but can be optimized to share it via KSM RAM=8 @@ -79,7 +89,7 @@ EOF --network bridge=virbr0,model=$NIC,mac="52:54:00:83:79:0$i" \ --disk $DISK-system,bus=virtio,cache=none,format=$FORMAT,driver.discard=unmap \ --disk $DISK-tests,bus=virtio,cache=none,format=$FORMAT,driver.discard=unmap \ - --import --noautoconsole >/dev/null + --import --noautoconsole ${OPTS[0]} ${OPTS[1]} done # generate some memory stats @@ -98,19 +108,30 @@ echo '*/5 * * * * /root/cronjob.sh' > crontab.txt sudo crontab crontab.txt rm crontab.txt -# check if the machines are okay -echo "Waiting for vm's to come up... (${VMs}x CPU=$CPU RAM=$RAM)" -for ((i=1; i<=VMs; i++)); do - .github/workflows/scripts/qemu-wait-for-vm.sh vm$i -done -echo "All $VMs VMs are up now." - # Save the VM's serial output (ttyS0) to /var/tmp/console.txt # - ttyS0 on the VM corresponds to a local /dev/pty/N entry # - use 'virsh ttyconsole' to lookup the /dev/pty/N entry for ((i=1; i<=VMs; i++)); do mkdir -p $RESPATH/vm$i read "pty" <<< $(sudo virsh ttyconsole vm$i) + + # Create the file so we can tail it, even if there's no output. + touch $RESPATH/vm$i/console.txt + sudo nohup bash -c "cat $pty > $RESPATH/vm$i/console.txt" & + + # Write all VM boot lines to the console to aid in debugging failed boots. 
+ # The boot lines from all the VMs will be munged together, so prepend each + # line with the vm hostname (like 'vm1:'). + (while IFS=$'\n' read -r line; do echo "vm$i: $line" ; done < <(sudo tail -f $RESPATH/vm$i/console.txt)) & + done echo "Console logging for ${VMs}x $OS started." + + +# check if the machines are okay +echo "Waiting for vm's to come up... (${VMs}x CPU=$CPU RAM=$RAM)" +for ((i=1; i<=VMs; i++)); do + .github/workflows/scripts/qemu-wait-for-vm.sh vm$i +done +echo "All $VMs VMs are up now." diff --git a/.github/workflows/scripts/qemu-6-tests.sh b/.github/workflows/scripts/qemu-6-tests.sh index e8e6adecd62f..ca6ac77f146d 100755 --- a/.github/workflows/scripts/qemu-6-tests.sh +++ b/.github/workflows/scripts/qemu-6-tests.sh @@ -21,11 +21,13 @@ function prefix() { S=$((DIFF-(M*60))) CTR=$(cat /tmp/ctr) - echo $LINE| grep -q "^Test[: ]" && CTR=$((CTR+1)) && echo $CTR > /tmp/ctr + echo $LINE| grep -q '^\[.*] Test[: ]' && CTR=$((CTR+1)) && echo $CTR > /tmp/ctr BASE="$HOME/work/zfs/zfs" COLOR="$BASE/scripts/zfs-tests-color.sh" - CLINE=$(echo $LINE| grep "^Test[ :]" | sed -e 's|/usr/local|/usr|g' \ + CLINE=$(echo $LINE| grep '^\[.*] Test[: ]' \ + | sed -e 's|^\[.*] Test|Test|g' \ + | sed -e 's|/usr/local|/usr|g' \ | sed -e 's| /usr/share/zfs/zfs-tests/tests/| |g' | $COLOR) if [ -z "$CLINE" ]; then printf "vm${ID}: %s\n" "$LINE" @@ -109,7 +111,7 @@ fi sudo dmesg -c > dmesg-prerun.txt mount > mount.txt df -h > df-prerun.txt -$TDIR/zfs-tests.sh -vK -s 3GB -T $TAGS +$TDIR/zfs-tests.sh -vKO -s 3GB -T $TAGS RV=$? 
df -h > df-postrun.txt echo $RV > tests-exitcode.txt diff --git a/.github/workflows/zfs-qemu-packages.yml b/.github/workflows/zfs-qemu-packages.yml index 5b5afe746859..d8a95954fe1a 100644 --- a/.github/workflows/zfs-qemu-packages.yml +++ b/.github/workflows/zfs-qemu-packages.yml @@ -32,6 +32,11 @@ on: options: - "Build RPMs" - "Test repo" + patch_level: + type: string + required: false + default: "" + description: "(optional) patch level number" repo_url: type: string required: false @@ -78,7 +83,13 @@ jobs: mkdir -p /tmp/repo ssh zfs@vm0 '$HOME/zfs/.github/workflows/scripts/qemu-test-repo-vm.sh' ${{ github.event.inputs.repo_url }} else - .github/workflows/scripts/qemu-4-build.sh --repo --release --dkms --tarball ${{ matrix.os }} + EXTRA="" + if [ -n "${{ github.event.inputs.patch_level }}" ] ; then + EXTRA="--patch-level ${{ github.event.inputs.patch_level }}" + fi + + .github/workflows/scripts/qemu-4-build.sh $EXTRA \ + --repo --release --dkms --tarball ${{ matrix.os }} fi - name: Prepare artifacts diff --git a/.github/workflows/zfs-qemu.yml b/.github/workflows/zfs-qemu.yml index 1d9899ae895f..3b164548f9be 100644 --- a/.github/workflows/zfs-qemu.yml +++ b/.github/workflows/zfs-qemu.yml @@ -5,16 +5,6 @@ on: pull_request: workflow_dispatch: inputs: - include_stream9: - type: boolean - required: false - default: false - description: 'Test on CentOS 9 stream' - include_stream10: - type: boolean - required: false - default: false - description: 'Test on CentOS 10 stream' fedora_kernel_ver: type: string required: false @@ -39,8 +29,8 @@ jobs: - name: Generate OS config and CI type id: os run: | - FULL_OS='["almalinux8", "almalinux9", "almalinux10", "debian11", "debian12", "fedora41", "fedora42", "freebsd13-4r", "freebsd14-2s", "freebsd15-0c", "ubuntu22", "ubuntu24"]' - QUICK_OS='["almalinux8", "almalinux9", "almalinux10", "debian12", "fedora42", "freebsd14-2r", "ubuntu24"]' + FULL_OS='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", 
"debian12", "debian13", "fedora41", "fedora42", "freebsd13-5r", "freebsd14-3s", "freebsd15-0c", "ubuntu22", "ubuntu24"]' + QUICK_OS='["almalinux8", "almalinux9", "almalinux10", "debian12", "fedora42", "freebsd14-3s", "ubuntu24"]' # determine CI type when running on PR ci_type="full" if ${{ github.event_name == 'pull_request' }}; then @@ -54,7 +44,7 @@ jobs: os_selection="$FULL_OS" fi - if [ ${{ github.event.inputs.fedora_kernel_ver }} != "" ] ; then + if ${{ github.event.inputs.fedora_kernel_ver != '' }}; then # They specified a custom kernel version for Fedora. Use only # Fedora runners. os_json=$(echo ${os_selection} | jq -c '[.[] | select(startswith("fedora"))]') @@ -63,17 +53,8 @@ jobs: os_json=$(echo ${os_selection} | jq -c) fi - # Add optional runners - if [ "${{ github.event.inputs.include_stream9 }}" == 'true' ]; then - os_json=$(echo $os_json | jq -c '. += ["centos-stream9"]') - fi - if [ "${{ github.event.inputs.include_stream10 }}" == 'true' ]; then - os_json=$(echo $os_json | jq -c '. 
+= ["centos-stream10"]') - fi - - echo $os_json - echo "os=$os_json" >> $GITHUB_OUTPUT - echo "ci_type=$ci_type" >> $GITHUB_OUTPUT + echo "os=$os_json" | tee -a $GITHUB_OUTPUT + echo "ci_type=$ci_type" | tee -a $GITHUB_OUTPUT qemu-vm: name: qemu-x86 @@ -81,13 +62,13 @@ jobs: strategy: fail-fast: false matrix: - # rhl: almalinux8, almalinux9, centos-stream9, fedora41 - # debian: debian11, debian12, ubuntu22, ubuntu24 + # rhl: almalinux8, almalinux9, centos-stream9, fedora4x + # debian: debian12, debian13, ubuntu22, ubuntu24 # misc: archlinux, tumbleweed - # FreeBSD variants of 2024-12: - # FreeBSD Release: freebsd13-4r, freebsd14-2r - # FreeBSD Stable: freebsd13-4s, freebsd14-2s - # FreeBSD Current: freebsd15-0c + # FreeBSD variants of 2025-06: + # FreeBSD Release: freebsd13-5r, freebsd14-2r, freebsd14-3r + # FreeBSD Stable: freebsd13-5s, freebsd14-3s + # FreeBSD Current: freebsd15-0c, freebsd16-0c os: ${{ fromJson(needs.test-config.outputs.test_os) }} runs-on: ubuntu-24.04 steps: @@ -96,8 +77,12 @@ jobs: ref: ${{ github.event.pull_request.head.sha }} - name: Setup QEMU - timeout-minutes: 10 - run: .github/workflows/scripts/qemu-1-setup.sh + timeout-minutes: 20 + run: | + # Add a timestamp to each line to debug timeouts + while IFS=$'\n' read -r line; do + echo "$(date +'%H:%M:%S') $line" + done < <(.github/workflows/scripts/qemu-1-setup.sh) - name: Start build machine timeout-minutes: 10 diff --git a/.github/workflows/zloop.yml b/.github/workflows/zloop.yml index 7b3bf49d90d5..4ae3ccdc5484 100644 --- a/.github/workflows/zloop.yml +++ b/.github/workflows/zloop.yml @@ -12,7 +12,8 @@ jobs: zloop: runs-on: ubuntu-24.04 env: - TEST_DIR: /var/tmp/zloop + WORK_DIR: /mnt/zloop + CORE_DIR: /mnt/zloop/cores steps: - uses: actions/checkout@v4 with: @@ -40,38 +41,37 @@ jobs: sudo modprobe zfs - name: Tests run: | - sudo mkdir -p $TEST_DIR - # run for 10 minutes or at most 6 iterations for a maximum runner - # time of 60 minutes. 
- sudo /usr/share/zfs/zloop.sh -t 600 -I 6 -l -m 1 -- -T 120 -P 60 + sudo truncate -s 256G /mnt/vdev + sudo zpool create cipool -m $WORK_DIR -O compression=on -o autotrim=on /mnt/vdev + sudo /usr/share/zfs/zloop.sh -t 600 -I 6 -l -m 1 -c $CORE_DIR -f $WORK_DIR -- -T 120 -P 60 - name: Prepare artifacts if: failure() run: | - sudo chmod +r -R $TEST_DIR/ + sudo chmod +r -R $WORK_DIR/ - name: Ztest log if: failure() run: | - grep -B10 -A1000 'ASSERT' $TEST_DIR/*/ztest.out || tail -n 1000 $TEST_DIR/*/ztest.out + grep -B10 -A1000 'ASSERT' $CORE_DIR/*/ztest.out || tail -n 1000 $CORE_DIR/*/ztest.out - name: Gdb log if: failure() run: | - sed -n '/Backtraces (full)/q;p' $TEST_DIR/*/ztest.gdb + sed -n '/Backtraces (full)/q;p' $CORE_DIR/*/ztest.gdb - name: Zdb log if: failure() run: | - cat $TEST_DIR/*/ztest.zdb + cat $CORE_DIR/*/ztest.zdb - uses: actions/upload-artifact@v4 if: failure() with: name: Logs path: | - /var/tmp/zloop/*/ - !/var/tmp/zloop/*/vdev/ + /mnt/zloop/*/ + !/mnt/zloop/cores/*/vdev/ if-no-files-found: ignore - uses: actions/upload-artifact@v4 if: failure() with: name: Pool files path: | - /var/tmp/zloop/*/vdev/ + /mnt/zloop/cores/*/vdev/ if-no-files-found: ignore diff --git a/META b/META index 9e971a564912..d3cf1eeabec5 100644 --- a/META +++ b/META @@ -1,10 +1,10 @@ Meta: 1 Name: zfs Branch: 1.0 -Version: 2.2.8 +Version: 2.2.9 Release: 1 Release-Tags: relext License: CDDL Author: OpenZFS -Linux-Maximum: 6.15 +Linux-Maximum: 6.17 Linux-Minimum: 4.18 diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 9f81292f06e7..7c498089e858 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -912,19 +912,15 @@ zfs_do_clone(int argc, char **argv) } /* - * Return a default volblocksize for the pool which always uses more than - * half of the data sectors. This primarily applies to dRAID which always - * writes full stripe widths. + * Calculate the minimum allocation size based on the top-level vdevs. 
*/ static uint64_t -default_volblocksize(zpool_handle_t *zhp, nvlist_t *props) +calculate_volblocksize(nvlist_t *config) { - uint64_t volblocksize, asize = SPA_MINBLOCKSIZE; + uint64_t asize = SPA_MINBLOCKSIZE; nvlist_t *tree, **vdevs; uint_t nvdevs; - nvlist_t *config = zpool_get_config(zhp, NULL); - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 || nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &vdevs, &nvdevs) != 0) { @@ -955,6 +951,24 @@ default_volblocksize(zpool_handle_t *zhp, nvlist_t *props) } } + return (asize); +} + +/* + * Return a default volblocksize for the pool which always uses more than + * half of the data sectors. This primarily applies to dRAID which always + * writes full stripe widths. + */ +static uint64_t +default_volblocksize(zpool_handle_t *zhp, nvlist_t *props) +{ + uint64_t volblocksize, asize = SPA_MINBLOCKSIZE; + + nvlist_t *config = zpool_get_config(zhp, NULL); + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_MAX_ALLOC, &asize) != 0) + asize = calculate_volblocksize(config); + /* * Calculate the target volblocksize such that more than half * of the asize is used. The following table is for 4k sectors. @@ -7416,6 +7430,7 @@ unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual) struct extmnttab entry; const char *cmdname = (op == OP_SHARE) ? "unshare" : "unmount"; ino_t path_inode; + char *zfs_mntpnt, *entry_mntpnt; /* * Search for the given (major,minor) pair in the mount table. @@ -7457,6 +7472,24 @@ unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual) goto out; } + /* + * If the filesystem is mounted, check that the mountpoint matches + * the one in the mnttab entry w.r.t. provided path. If it doesn't, + * then we should not proceed further. 
+ */ + entry_mntpnt = strdup(entry.mnt_mountp); + if (zfs_is_mounted(zhp, &zfs_mntpnt)) { + if (strcmp(zfs_mntpnt, entry_mntpnt) != 0) { + (void) fprintf(stderr, gettext("cannot %s '%s': " + "not an original mountpoint\n"), cmdname, path); + free(zfs_mntpnt); + free(entry_mntpnt); + goto out; + } + free(zfs_mntpnt); + } + free(entry_mntpnt); + if (op == OP_SHARE) { char nfs_mnt_prop[ZFS_MAXPROPLEN]; char smbshare_prop[ZFS_MAXPROPLEN]; diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index fbd4b81dfacc..820382845753 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -608,23 +608,29 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) verify(nvlist_lookup_string(cnv, ZPOOL_CONFIG_PATH, &path) == 0); + /* + * Skip active spares they should never cause + * the pool to be evaluated as inconsistent. + */ + if (is_spare(NULL, path)) + continue; + /* * If we have a raidz/mirror that combines disks - * with files, report it as an error. + * with files, only report it as an error when + * fatal is set to ensure all the replication + * checks aren't skipped in check_replication(). */ - if (!dontreport && type != NULL && + if (fatal && !dontreport && type != NULL && strcmp(type, childtype) != 0) { if (ret != NULL) free(ret); ret = NULL; - if (fatal) - vdev_error(gettext( - "mismatched replication " - "level: %s contains both " - "files and devices\n"), - rep.zprl_type); - else - return (NULL); + vdev_error(gettext( + "mismatched replication " + "level: %s contains both " + "files and devices\n"), + rep.zprl_type); dontreport = B_TRUE; } diff --git a/config/ax_python_devel.m4 b/config/ax_python_devel.m4 index 1f480db6d233..935056cc4c0a 100644 --- a/config/ax_python_devel.m4 +++ b/config/ax_python_devel.m4 @@ -72,7 +72,7 @@ # modified version of the Autoconf Macro, you may extend this special # exception to the GPL to apply to your modified version as well. 
-#serial 36 +#serial 37 AU_ALIAS([AC_PYTHON_DEVEL], [AX_PYTHON_DEVEL]) AC_DEFUN([AX_PYTHON_DEVEL],[ @@ -316,7 +316,7 @@ EOD` PYTHON_LIBS="-L$ac_python_libdir -lpython$ac_python_version" fi - if test -z "PYTHON_LIBS"; then + if test -z "$PYTHON_LIBS"; then AC_MSG_WARN([ Cannot determine location of your Python DSO. Please check it was installed with dynamic libraries enabled, or try setting PYTHON_LIBS by hand. diff --git a/config/kernel-dentry-operations.m4 b/config/kernel-dentry-operations.m4 index aa5a9f2aff39..9d1bb3a74b1d 100644 --- a/config/kernel-dentry-operations.m4 +++ b/config/kernel-dentry-operations.m4 @@ -24,6 +24,9 @@ dnl # dnl # 2.6.38 API change dnl # Added d_set_d_op() helper function. dnl # +dnl # 6.17 API change +dnl # d_set_d_op() removed. No direct replacement. +dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_D_SET_D_OP], [ ZFS_LINUX_TEST_SRC([d_set_d_op], [ #include @@ -34,11 +37,12 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_D_SET_D_OP], [ AC_DEFUN([ZFS_AC_KERNEL_D_SET_D_OP], [ AC_MSG_CHECKING([whether d_set_d_op() is available]) - ZFS_LINUX_TEST_RESULT_SYMBOL([d_set_d_op], - [d_set_d_op], [fs/dcache.c], [ + ZFS_LINUX_TEST_RESULT([d_set_d_op], [ AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_D_SET_D_OP, 1, + [Define if d_set_d_op() is available]) ], [ - ZFS_LINUX_TEST_ERROR([d_set_d_op]) + AC_MSG_RESULT(no) ]) ]) diff --git a/config/kernel-objtool.m4 b/config/kernel-objtool.m4 index e616ccebcbc0..3020440eb388 100644 --- a/config/kernel-objtool.m4 +++ b/config/kernel-objtool.m4 @@ -49,6 +49,15 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_OBJTOOL], [ #error "STACK_FRAME_NON_STANDARD is not defined." #endif ]) + + dnl # 6.15 made CONFIG_OBJTOOL_WERROR=y the default. We need to handle + dnl # this or our build will fail. + ZFS_LINUX_TEST_SRC([config_objtool_werror], [ + #if !defined(CONFIG_OBJTOOL_WERROR) + #error "CONFIG_OBJTOOL_WERROR is not defined." 
+ #endif + ]) + ]) AC_DEFUN([ZFS_AC_KERNEL_OBJTOOL], [ @@ -84,6 +93,14 @@ AC_DEFUN([ZFS_AC_KERNEL_OBJTOOL], [ ],[ AC_MSG_RESULT(no) ]) + + AC_MSG_CHECKING([whether CONFIG_OBJTOOL_WERROR is defined]) + ZFS_LINUX_TEST_RESULT([config_objtool_werror],[ + AC_MSG_RESULT(yes) + CONFIG_OBJTOOL_WERROR_DEFINED=yes + ],[ + AC_MSG_RESULT(no) + ]) ],[ AC_MSG_RESULT(no) ]) diff --git a/config/kernel-pagemap-readahead-page.m4 b/config/kernel-pagemap-readahead-page.m4 new file mode 100644 index 000000000000..30f3d56682fb --- /dev/null +++ b/config/kernel-pagemap-readahead-page.m4 @@ -0,0 +1,23 @@ +dnl # +dnl # Linux 6.16 removed readahead_page +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_PAGEMAP_READAHEAD_PAGE], [ + ZFS_LINUX_TEST_SRC([pagemap_has_readahead_page], [ + #include + ], [ + struct page *p __attribute__ ((unused)) = NULL; + struct readahead_control *ractl __attribute__ ((unused)) = NULL; + p = readahead_page(ractl); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_PAGEMAP_READAHEAD_PAGE], [ + AC_MSG_CHECKING([whether readahead_page() exists]) + ZFS_LINUX_TEST_RESULT([pagemap_has_readahead_page], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_PAGEMAP_READAHEAD_PAGE, 1, + [readahead_page() exists]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) diff --git a/config/kernel-readpages.m4 b/config/kernel-vfs-readpages.m4 similarity index 100% rename from config/kernel-readpages.m4 rename to config/kernel-vfs-readpages.m4 diff --git a/config/kernel-vfs-writepage.m4 b/config/kernel-vfs-writepage.m4 new file mode 100644 index 000000000000..d438e85b457c --- /dev/null +++ b/config/kernel-vfs-writepage.m4 @@ -0,0 +1,24 @@ +dnl # +dnl # Linux 6.16 removes address_space_operations ->writepage +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_WRITEPAGE], [ + ZFS_LINUX_TEST_SRC([vfs_has_writepage], [ + #include + + static const struct address_space_operations + aops __attribute__ ((unused)) = { + .writepage = NULL, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_VFS_WRITEPAGE], [ + AC_MSG_CHECKING([whether aops->writepage exists]) + 
ZFS_LINUX_TEST_RESULT([vfs_has_writepage], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_VFS_WRITEPAGE, 1, + [address_space_operations->writepage exists]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index f0a4dc0fe430..1cc99458ce20 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -83,6 +83,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO ZFS_AC_KERNEL_SRC_VFS_READPAGES + ZFS_AC_KERNEL_SRC_VFS_WRITEPAGE ZFS_AC_KERNEL_SRC_VFS_SET_PAGE_DIRTY_NOBUFFERS ZFS_AC_KERNEL_SRC_VFS_IOV_ITER ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE @@ -112,6 +113,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_STANDALONE_LINUX_STDARG ZFS_AC_KERNEL_SRC_STRLCPY ZFS_AC_KERNEL_SRC_PAGEMAP_FOLIO_WAIT_BIT + ZFS_AC_KERNEL_SRC_PAGEMAP_READAHEAD_PAGE ZFS_AC_KERNEL_SRC_ADD_DISK ZFS_AC_KERNEL_SRC_KTHREAD ZFS_AC_KERNEL_SRC_ZERO_PAGE @@ -197,6 +199,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_VFS_FSYNC_2ARGS ZFS_AC_KERNEL_VFS_DIRECT_IO ZFS_AC_KERNEL_VFS_READPAGES + ZFS_AC_KERNEL_VFS_WRITEPAGE ZFS_AC_KERNEL_VFS_SET_PAGE_DIRTY_NOBUFFERS ZFS_AC_KERNEL_VFS_IOV_ITER ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE @@ -226,6 +229,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_STANDALONE_LINUX_STDARG ZFS_AC_KERNEL_STRLCPY ZFS_AC_KERNEL_PAGEMAP_FOLIO_WAIT_BIT + ZFS_AC_KERNEL_PAGEMAP_READAHEAD_PAGE ZFS_AC_KERNEL_ADD_DISK ZFS_AC_KERNEL_KTHREAD ZFS_AC_KERNEL_ZERO_PAGE diff --git a/config/user-statx.m4 b/config/user-statx.m4 new file mode 100644 index 000000000000..0315f93e0c20 --- /dev/null +++ b/config/user-statx.m4 @@ -0,0 +1,34 @@ +dnl # +dnl # Check for statx() function and STATX_MNT_ID availability +dnl # +AC_DEFUN([ZFS_AC_CONFIG_USER_STATX], [ + AC_CHECK_HEADERS([linux/stat.h], + [have_stat_headers=yes], + [have_stat_headers=no]) + + AS_IF([test "x$have_stat_headers" = "xyes"], [ + AC_CHECK_FUNC([statx], [ + AC_DEFINE([HAVE_STATX], [1], [statx() is available]) + + dnl Check 
for STATX_MNT_ID availability + AC_MSG_CHECKING([for STATX_MNT_ID]) + AC_COMPILE_IFELSE([ + AC_LANG_PROGRAM([[ + #include + ]], [[ + struct statx stx; + int mask = STATX_MNT_ID; + (void)mask; + (void)stx.stx_mnt_id; + ]]) + ], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_STATX_MNT_ID], [1], [STATX_MNT_ID is available]) + ], [ + AC_MSG_RESULT([no]) + ]) + ]) + ], [ + AC_MSG_WARN([linux/stat.h not found; skipping statx support]) + ]) +]) dnl end AC_DEFUN diff --git a/config/user.m4 b/config/user.m4 index 4e31745a2abc..b2b882ce6c88 100644 --- a/config/user.m4 +++ b/config/user.m4 @@ -17,6 +17,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [ ZFS_AC_CONFIG_USER_LIBUDEV ZFS_AC_CONFIG_USER_LIBUUID ZFS_AC_CONFIG_USER_LIBBLKID + ZFS_AC_CONFIG_USER_STATX ]) ZFS_AC_CONFIG_USER_LIBTIRPC ZFS_AC_CONFIG_USER_LIBCRYPTO diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 index 57582b9d18f5..4d82a7324e2e 100644 --- a/config/zfs-build.m4 +++ b/config/zfs-build.m4 @@ -205,6 +205,46 @@ AC_DEFUN([ZFS_AC_DEBUG_INVARIANTS], [ AC_MSG_RESULT([$enable_invariants]) ]) +dnl # Disabled by default. If enabled allows a configured "turn objtools +dnl # warnings into errors" (CONFIG_OBJTOOL_WERROR) behavior to take effect. +dnl # If disabled, objtool warnings are never turned into errors. It can't +dnl # be enabled if the kernel wasn't compiled with CONFIG_OBJTOOL_WERROR=y. 
+dnl # +AC_DEFUN([ZFS_AC_OBJTOOL_WERROR], [ + AC_MSG_CHECKING([whether objtool error on warning behavior is enabled]) + AC_ARG_ENABLE([objtool-werror], + [AS_HELP_STRING([--enable-objtool-werror], + [Enable objtool's error on warning behaviour if present @<:@default=no@:>@])], + [enable_objtool_werror=$enableval], + [enable_objtool_werror=no]) + AC_MSG_RESULT([$enable_objtool_werror]) + + AS_IF([test x$CONFIG_OBJTOOL_WERROR_DEFINED = xyes],[ + AS_IF([test x$enable_objtool_werror = xyes],[ + AC_MSG_NOTICE([enable-objtool-werror defined, keeping -Werror ]) + ],[ + AC_MSG_NOTICE([enable-objtool-werror undefined, disabling -Werror ]) + OBJTOOL_DISABLE_WERROR=y + abs_objtool_binary=$kernelsrc/tools/objtool/objtool + AS_IF([test -x $abs_objtool_binary],[],[ + AC_MSG_ERROR([*** objtool binary $abs_objtool_binary not found]) + ]) + dnl # The path to the wrapper is defined in modules/Makefile.in. + ]) + ],[ + dnl # We can't enable --Werror if it's not there. + AS_IF([test x$enable_objtool_werror = xyes],[ + AC_MSG_ERROR([ + *** Cannot enable objtool-werror, + *** a kernel built with CONFIG_OBJTOOL_WERROR=y is required. 
+ ]) + ],[]) + ]) + + AC_SUBST(OBJTOOL_DISABLE_WERROR) + AC_SUBST(abs_objtool_binary) +]) + AC_DEFUN([ZFS_AC_CONFIG_ALWAYS], [ AX_COUNT_CPUS([]) AC_SUBST(CPU_COUNT) @@ -514,32 +554,35 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ [with_vendor=$withval], [with_vendor=check]) AS_IF([test "x$with_vendor" = "xcheck"],[ - if test -f /etc/toss-release ; then - VENDOR=toss ; + if test -f /etc/alpine-release ; then + VENDOR=alpine ; + elif test -f /etc/arch-release ; then + VENDOR=arch ; + elif test -f /etc/artix-release ; then + VENDOR=artix ; elif test -f /etc/fedora-release ; then VENDOR=fedora ; - elif test -f /etc/redhat-release ; then - VENDOR=redhat ; + elif test -f /bin/freebsd-version ; then + VENDOR=freebsd ; elif test -f /etc/gentoo-release ; then VENDOR=gentoo ; - elif test -f /etc/arch-release ; then - VENDOR=arch ; + elif test -f /etc/lunar.release ; then + VENDOR=lunar ; + elif test -f /etc/openEuler-release ; then + VENDOR=openeuler ; elif test -f /etc/SuSE-release ; then VENDOR=sles ; elif test -f /etc/slackware-version ; then VENDOR=slackware ; - elif test -f /etc/lunar.release ; then - VENDOR=lunar ; + elif test -f /etc/toss-release ; then + VENDOR=toss ; elif test -f /etc/lsb-release ; then VENDOR=ubuntu ; + # put debian and redhat last as derivatives may have also their file elif test -f /etc/debian_version ; then VENDOR=debian ; - elif test -f /etc/alpine-release ; then - VENDOR=alpine ; - elif test -f /bin/freebsd-version ; then - VENDOR=freebsd ; - elif test -f /etc/openEuler-release ; then - VENDOR=openeuler ; + elif test -f /etc/redhat-release ; then + VENDOR=redhat ; else VENDOR= ; fi], @@ -552,20 +595,15 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ AC_MSG_CHECKING([default package type]) case "$VENDOR" in - toss) DEFAULT_PACKAGE=rpm ;; - redhat) DEFAULT_PACKAGE=rpm ;; - fedora) DEFAULT_PACKAGE=rpm ;; - gentoo) DEFAULT_PACKAGE=tgz ;; - alpine) DEFAULT_PACKAGE=tgz ;; - arch) DEFAULT_PACKAGE=tgz ;; - sles) DEFAULT_PACKAGE=rpm ;; - slackware) 
DEFAULT_PACKAGE=tgz ;; - lunar) DEFAULT_PACKAGE=tgz ;; - ubuntu) DEFAULT_PACKAGE=deb ;; - debian) DEFAULT_PACKAGE=deb ;; - freebsd) DEFAULT_PACKAGE=pkg ;; - openeuler) DEFAULT_PACKAGE=rpm ;; - *) DEFAULT_PACKAGE=rpm ;; + alpine|arch|artix|gentoo|lunar|slackware) + DEFAULT_PACKAGE=tgz ;; + debian|ubuntu) + DEFAULT_PACKAGE=deb ;; + freebsd) + DEFAULT_PACKAGE=pkg ;; + *) + # fedora|openeuler|redhat|sles|toss + DEFAULT_PACKAGE=rpm ;; esac AC_MSG_RESULT([$DEFAULT_PACKAGE]) AC_SUBST(DEFAULT_PACKAGE) @@ -580,7 +618,9 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ AC_MSG_CHECKING([default shell]) case "$VENDOR" in - gentoo|alpine) DEFAULT_INIT_SHELL=/sbin/openrc-run + alpine|gentoo) DEFAULT_INIT_SHELL=/sbin/openrc-run + IS_SYSV_RC=false ;; + artix) DEFAULT_INIT_SHELL=/usr/bin/openrc-run IS_SYSV_RC=false ;; *) DEFAULT_INIT_SHELL=/bin/sh IS_SYSV_RC=true ;; @@ -600,17 +640,19 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ AC_MSG_CHECKING([default init config directory]) case "$VENDOR" in - alpine) initconfdir=/etc/conf.d ;; - gentoo) initconfdir=/etc/conf.d ;; - toss) initconfdir=/etc/sysconfig ;; - redhat) initconfdir=/etc/sysconfig ;; - fedora) initconfdir=/etc/sysconfig ;; - sles) initconfdir=/etc/sysconfig ;; - openeuler) initconfdir=/etc/sysconfig ;; - ubuntu) initconfdir=/etc/default ;; - debian) initconfdir=/etc/default ;; - freebsd) initconfdir=$sysconfdir/rc.conf.d;; - *) initconfdir=/etc/default ;; + alpine|artix|gentoo) + initconfdir=/etc/conf.d + ;; + fedora|openeuler|redhat|sles|toss) + initconfdir=/etc/sysconfig + ;; + freebsd) + initconfdir=$sysconfdir/rc.conf.d + ;; + *) + # debian|ubuntu + initconfdir=/etc/default + ;; esac AC_MSG_RESULT([$initconfdir]) AC_SUBST(initconfdir) @@ -627,11 +669,15 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ AC_MSG_CHECKING([default bash completion directory]) case "$VENDOR" in - ubuntu) bashcompletiondir=/usr/share/bash-completion/completions ;; - debian) bashcompletiondir=/usr/share/bash-completion/completions ;; - freebsd) 
bashcompletiondir=$sysconfdir/bash_completion.d;; - gentoo) bashcompletiondir=/usr/share/bash-completion/completions ;; - *) bashcompletiondir=/etc/bash_completion.d ;; + arch|artix|debian|gentoo|ubuntu) + bashcompletiondir=/usr/share/bash-completion/completions + ;; + freebsd) + bashcompletiondir=$sysconfdir/bash_completion.d + ;; + *) + bashcompletiondir=/etc/bash_completion.d + ;; esac AC_MSG_RESULT([$bashcompletiondir]) AC_SUBST(bashcompletiondir) diff --git a/configure.ac b/configure.ac index f31fe1db81e4..fb4b704a9f5c 100644 --- a/configure.ac +++ b/configure.ac @@ -64,6 +64,7 @@ ZFS_AC_DEBUGINFO ZFS_AC_DEBUG_KMEM ZFS_AC_DEBUG_KMEM_TRACKING ZFS_AC_DEBUG_INVARIANTS +ZFS_AC_OBJTOOL_WERROR AC_CONFIG_FILES([ contrib/debian/rules @@ -85,5 +86,6 @@ AC_CONFIG_FILES([ zfs.release ]) +AC_CONFIG_FILES([scripts/objtool-wrapper], [chmod +x scripts/objtool-wrapper]) AC_OUTPUT diff --git a/contrib/debian/rules.in b/contrib/debian/rules.in index 3226d604546c..2b0568938b25 100755 --- a/contrib/debian/rules.in +++ b/contrib/debian/rules.in @@ -93,7 +93,7 @@ override_dh_auto_install: @# Install the DKMS source. 
@# We only want the files needed to build the modules install -D -t '$(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)/scripts' \ - '$(CURDIR)/scripts/dkms.postbuild' + '$(CURDIR)/scripts/dkms.postbuild' '$(CURDIR)/scripts/objtool-wrapper.in' $(foreach file,$(DKMSFILES),mv '$(CURDIR)/$(NAME)-$(DEB_VERSION_UPSTREAM)/$(file)' '$(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)' || exit 1;) @# Only ever build Linux modules @@ -108,8 +108,8 @@ override_dh_auto_install: @# - zfs.release$ @# * Takes care of spaces and tabs @# * Remove reference to ZFS_AC_PACKAGE - awk '/^AC_CONFIG_FILES\(\[/,/^\]\)/ {\ - if ($$0 !~ /^(AC_CONFIG_FILES\(\[([ \t]+)?$$|\]\)([ \t]+)?$$|([ \t]+)?(include\/(Makefile|sys|os\/(Makefile|linux))|module\/|Makefile([ \t]+)?$$|zfs\.release([ \t]+)?$$))/) \ + awk '/^AC_CONFIG_FILES\(\[/,/\]\)/ {\ + if ($$0 !~ /^(AC_CONFIG_FILES\(\[([ \t]+)?$$|\]\)([ \t]+)?$$|([ \t]+)?(include\/(Makefile|sys|os\/(Makefile|linux))|module\/|Makefile([ \t]+)?$$|zfs\.release([ \t]+)?$$))|scripts\/objtool-wrapper.*\]\)$$/) \ {next} } {print}' \ '$(CURDIR)/$(NAME)-$(DEB_VERSION_UPSTREAM)/configure.ac' | sed '/ZFS_AC_PACKAGE/d' > '$(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)/configure.ac' @# Set "SUBDIRS = module include" for CONFIG_KERNEL and remove SUBDIRS for all other configs. 
diff --git a/contrib/initramfs/scripts/zfs b/contrib/initramfs/scripts/zfs index c569b2528368..67707e9d80f4 100644 --- a/contrib/initramfs/scripts/zfs +++ b/contrib/initramfs/scripts/zfs @@ -979,7 +979,8 @@ mountroot() touch /run/zfs_unlock_complete if [ -e /run/zfs_unlock_complete_notify ]; then - read -r < /run/zfs_unlock_complete_notify + # shellcheck disable=SC2034 + read -r zfs_unlock_complete_notify < /run/zfs_unlock_complete_notify fi # ------------ diff --git a/contrib/intel_qat/readme.md b/contrib/intel_qat/readme.md index 7e45d395bb80..04c299b6404c 100644 --- a/contrib/intel_qat/readme.md +++ b/contrib/intel_qat/readme.md @@ -8,7 +8,7 @@ This contrib contains community compatibility patches to get Intel QAT working o These patches are based on the following Intel QAT version: [1.7.l.4.10.0-00014](https://01.org/sites/default/files/downloads/qat1.7.l.4.10.0-00014.tar.gz) -When using QAT with above kernels versions, the following patches needs to be applied using: +When using QAT with the above kernel versions, the following patches need to be applied using: patch -p1 < _$PATCH_ _Where $PATCH refers to the path of the patch in question_ diff --git a/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py b/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py index c94ae6de6bbf..136d48350ef1 100644 --- a/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py +++ b/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py @@ -4222,7 +4222,7 @@ def reset(self): self.getRoot().reset() return - # On the Buildbot builders this may fail with "pool is busy" + # On the CI builders this may fail with "pool is busy" # Retry 5 times before raising an error retry = 0 while True: diff --git a/etc/init.d/README.md b/etc/init.d/README.md index da780fdc1222..3852dd9a6b2e 100644 --- a/etc/init.d/README.md +++ b/etc/init.d/README.md @@ -1,5 +1,5 @@ DESCRIPTION - These script were written with the primary intention of being portable and + These scripts were written with the primary intention of 
being portable and usable on as many systems as possible. This is, in practice, usually not possible. But the intention is there. diff --git a/include/os/freebsd/spl/sys/misc.h b/include/os/freebsd/spl/sys/misc.h index 894ccd8bf9b1..96f7207d28b5 100644 --- a/include/os/freebsd/spl/sys/misc.h +++ b/include/os/freebsd/spl/sys/misc.h @@ -55,4 +55,9 @@ struct opensolaris_utsname { #define task_io_account_read(n) #define task_io_account_write(n) +/* + * Check if the current thread is a memory reclaim thread. + */ +extern int current_is_reclaim_thread(void); + #endif /* _OPENSOLARIS_SYS_MISC_H_ */ diff --git a/include/os/linux/Makefile.am b/include/os/linux/Makefile.am index b4ffbcff9533..c0c9dd1398ce 100644 --- a/include/os/linux/Makefile.am +++ b/include/os/linux/Makefile.am @@ -8,6 +8,7 @@ kernel_linux_HEADERS = \ %D%/kernel/linux/mm_compat.h \ %D%/kernel/linux/mod_compat.h \ %D%/kernel/linux/page_compat.h \ + %D%/kernel/linux/pagemap_compat.h \ %D%/kernel/linux/simd.h \ %D%/kernel/linux/simd_aarch64.h \ %D%/kernel/linux/simd_arm.h \ diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h index 26e7b0b2a34a..7f5fe0e5218b 100644 --- a/include/os/linux/kernel/linux/blkdev_compat.h +++ b/include/os/linux/kernel/linux/blkdev_compat.h @@ -514,24 +514,6 @@ blk_generic_alloc_queue(make_request_fn make_request, int node_id) } #endif /* !HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ -/* - * All the io_*() helper functions below can operate on a bio, or a rq, but - * not both. The older submit_bio() codepath will pass a bio, and the - * newer blk-mq codepath will pass a rq. 
- */ -static inline int -io_data_dir(struct bio *bio, struct request *rq) -{ - if (rq != NULL) { - if (op_is_write(req_op(rq))) { - return (WRITE); - } else { - return (READ); - } - } - return (bio_data_dir(bio)); -} - static inline int io_is_flush(struct bio *bio, struct request *rq) { diff --git a/include/os/linux/kernel/linux/dcache_compat.h b/include/os/linux/kernel/linux/dcache_compat.h index de533a5fd28b..72f603c523d8 100644 --- a/include/os/linux/kernel/linux/dcache_compat.h +++ b/include/os/linux/kernel/linux/dcache_compat.h @@ -59,32 +59,6 @@ } while (0) #endif -/* - * 2.6.30 API change, - * The const keyword was added to the 'struct dentry_operations' in - * the dentry structure. To handle this we define an appropriate - * dentry_operations_t typedef which can be used. - */ -typedef const struct dentry_operations dentry_operations_t; - -/* - * 2.6.38 API addition, - * Added d_clear_d_op() helper function which clears some flags and the - * registered dentry->d_op table. This is required because d_set_d_op() - * issues a warning when the dentry operations table is already set. - * For the .zfs control directory to work properly we must be able to - * override the default operations table and register custom .d_automount - * and .d_revalidate callbacks. 
- */ -static inline void -d_clear_d_op(struct dentry *dentry) -{ - dentry->d_op = NULL; - dentry->d_flags &= ~( - DCACHE_OP_HASH | DCACHE_OP_COMPARE | - DCACHE_OP_REVALIDATE | DCACHE_OP_DELETE); -} - /* * Walk and invalidate all dentry aliases of an inode * unless it's a mountpoint diff --git a/include/os/linux/kernel/linux/pagemap_compat.h b/include/os/linux/kernel/linux/pagemap_compat.h new file mode 100644 index 000000000000..a0465ede0105 --- /dev/null +++ b/include/os/linux/kernel/linux/pagemap_compat.h @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2025, Rob Norris + */ + +#ifndef _ZFS_PAGEMAP_COMPAT_H +#define _ZFS_PAGEMAP_COMPAT_H + +#include + +#ifndef HAVE_PAGEMAP_READAHEAD_PAGE +#define readahead_page(ractl) (&(__readahead_folio(ractl)->page)) +#endif + +#endif diff --git a/include/os/linux/spl/sys/misc.h b/include/os/linux/spl/sys/misc.h index 299fe9c1ab07..f59d5dea2ee0 100644 --- a/include/os/linux/spl/sys/misc.h +++ b/include/os/linux/spl/sys/misc.h @@ -23,7 +23,13 @@ #define _OS_LINUX_SPL_MISC_H #include +#include extern void spl_signal_kobj_evt(struct block_device *bdev); +/* + * Check if the current thread is a memory reclaim thread. + */ +extern int current_is_reclaim_thread(void); + #endif diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index defebe3b2fbb..9f45a49afb37 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -952,7 +952,7 @@ typedef struct arc_sums { wmsum_t arcstat_data_size; wmsum_t arcstat_metadata_size; wmsum_t arcstat_dbuf_size; - wmsum_t arcstat_dnode_size; + aggsum_t arcstat_dnode_size; wmsum_t arcstat_bonus_size; wmsum_t arcstat_l2_hits; wmsum_t arcstat_l2_misses; @@ -1058,10 +1058,10 @@ extern uint_t arc_lotsfree_percent; extern uint64_t zfs_arc_min; extern uint64_t zfs_arc_max; -extern void arc_reduce_target_size(int64_t to_free); +extern uint64_t arc_reduce_target_size(uint64_t to_free); extern boolean_t arc_reclaim_needed(void); extern void arc_kmem_reap_soon(void); -extern void arc_wait_for_eviction(uint64_t, boolean_t); +extern void arc_wait_for_eviction(uint64_t, boolean_t, boolean_t); extern void arc_lowmem_init(void); extern void arc_lowmem_fini(void); diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 4329e4e86f2d..004e1795cc59 100644 --- a/include/sys/fs/zfs.h +++ 
b/include/sys/fs/zfs.h @@ -722,6 +722,8 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift" #define ZPOOL_CONFIG_ASHIFT "ashift" #define ZPOOL_CONFIG_ASIZE "asize" +#define ZPOOL_CONFIG_MIN_ALLOC "min_alloc" +#define ZPOOL_CONFIG_MAX_ALLOC "max_alloc" #define ZPOOL_CONFIG_DTL "DTL" #define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */ #define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */ diff --git a/include/sys/spa.h b/include/sys/spa.h index 6611141b9569..76e4446d3942 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1029,6 +1029,7 @@ extern pool_state_t spa_state(spa_t *spa); extern spa_load_state_t spa_load_state(spa_t *spa); extern uint64_t spa_freeze_txg(spa_t *spa); extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize); +extern void spa_get_min_alloc_range(spa_t *spa, uint64_t *min, uint64_t *max); extern uint64_t spa_get_dspace(spa_t *spa); extern uint64_t spa_get_checkpoint_space(spa_t *spa); extern uint64_t spa_get_slop_space(spa_t *spa); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index c7ecd3d0ccd3..a5c2eb0c705b 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -250,6 +250,7 @@ struct spa { uint64_t spa_min_ashift; /* of vdevs in normal class */ uint64_t spa_max_ashift; /* of vdevs in normal class */ uint64_t spa_min_alloc; /* of vdevs in normal class */ + uint64_t spa_max_alloc; /* of vdevs in normal class */ uint64_t spa_gcd_alloc; /* of vdevs in normal class */ uint64_t spa_config_guid; /* config pool guid */ uint64_t spa_load_guid; /* spa_load initialized guid */ diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 5dedb14c7fb5..75e3157852cb 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -235,6 +235,11 @@ typedef pthread_t kthread_t; #define thread_join(t) pthread_join((pthread_t)(t), NULL) #define newproc(f, a, cid, pri, ctp, pid) (ENOSYS) +/* + * Check if the current 
thread is a memory reclaim thread. + * Always returns false in userspace (no memory reclaim thread). + */ +#define current_is_reclaim_thread() (0) /* in libzpool, p0 exists only to have its address taken */ typedef struct proc { diff --git a/lib/libspl/include/os/linux/sys/stat.h b/lib/libspl/include/os/linux/sys/stat.h index e7f592620512..5fbe892ee1c1 100644 --- a/lib/libspl/include/os/linux/sys/stat.h +++ b/lib/libspl/include/os/linux/sys/stat.h @@ -30,6 +30,11 @@ #include /* for BLKGETSIZE64 */ +#ifdef HAVE_STATX +#include +#include +#endif + /* * Emulate Solaris' behavior of returning the block device size in fstat64(). */ diff --git a/lib/libspl/os/linux/getmntany.c b/lib/libspl/os/linux/getmntany.c index a46c4e931719..b09def344d2d 100644 --- a/lib/libspl/os/linux/getmntany.c +++ b/lib/libspl/os/linux/getmntany.c @@ -84,13 +84,21 @@ _sol_getmntent(FILE *fp, struct mnttab *mgetp) } static int -getextmntent_impl(FILE *fp, struct extmnttab *mp) +getextmntent_impl(FILE *fp, struct extmnttab *mp, uint64_t *mnt_id) { int ret; struct stat64 st; + *mnt_id = 0; ret = _sol_getmntent(fp, (struct mnttab *)mp); if (ret == 0) { +#ifdef HAVE_STATX_MNT_ID + struct statx stx; + if (statx(AT_FDCWD, mp->mnt_mountp, + AT_STATX_SYNC_AS_STAT | AT_SYMLINK_NOFOLLOW, + STATX_MNT_ID, &stx) == 0 && (stx.stx_mask & STATX_MNT_ID)) + *mnt_id = stx.stx_mnt_id; +#endif if (stat64(mp->mnt_mountp, &st) != 0) { mp->mnt_major = 0; mp->mnt_minor = 0; @@ -109,6 +117,12 @@ getextmntent(const char *path, struct extmnttab *entry, struct stat64 *statbuf) struct stat64 st; FILE *fp; int match; + boolean_t have_mnt_id = B_FALSE; + uint64_t target_mnt_id = 0; + uint64_t entry_mnt_id; +#ifdef HAVE_STATX_MNT_ID + struct statx stx; +#endif if (strlen(path) >= MAXPATHLEN) { (void) fprintf(stderr, "invalid object; pathname too long\n"); @@ -127,6 +141,13 @@ getextmntent(const char *path, struct extmnttab *entry, struct stat64 *statbuf) return (-1); } +#ifdef HAVE_STATX_MNT_ID + if (statx(AT_FDCWD, path, 
AT_STATX_SYNC_AS_STAT | AT_SYMLINK_NOFOLLOW, + STATX_MNT_ID, &stx) == 0 && (stx.stx_mask & STATX_MNT_ID)) { + have_mnt_id = B_TRUE; + target_mnt_id = stx.stx_mnt_id; + } +#endif if ((fp = fopen(MNTTAB, "re")) == NULL) { (void) fprintf(stderr, "cannot open %s\n", MNTTAB); @@ -138,12 +159,15 @@ getextmntent(const char *path, struct extmnttab *entry, struct stat64 *statbuf) */ match = 0; - while (getextmntent_impl(fp, entry) == 0) { - if (makedev(entry->mnt_major, entry->mnt_minor) == - statbuf->st_dev) { - match = 1; - break; + while (getextmntent_impl(fp, entry, &entry_mnt_id) == 0) { + if (have_mnt_id) { + match = (entry_mnt_id == target_mnt_id); + } else { + match = makedev(entry->mnt_major, entry->mnt_minor) == + statbuf->st_dev; } + if (match) + break; } (void) fclose(fp); diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 1d734d865a16..fc32c46a9bd7 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -816,6 +816,13 @@ even with a small average compressed block size of ~8 KiB. The parameter can be set to 0 (zero) to disable the limit, and only applies on Linux. . +.It Sy zfs_arc_shrinker_seeks Ns = Ns Sy 2 Pq int +Relative cost of ARC eviction on Linux, AKA number of seeks needed to +restore evicted page. +Bigger values make ARC more precious and evictions smaller, comparing to +other kernel subsystems. +Value of 4 means parity with page cache. +. .It Sy zfs_arc_sys_free Ns = Ns Sy 0 Ns B Pq u64 The target number of bytes the ARC should leave as free memory on the system. 
If zero, equivalent to the bigger of diff --git a/module/Makefile.in b/module/Makefile.in index 529ab81dcec5..98a536a7fe1a 100644 --- a/module/Makefile.in +++ b/module/Makefile.in @@ -57,6 +57,7 @@ modules-Linux: $(if @KERNEL_LD@,LD=@KERNEL_LD@) $(if @KERNEL_LLVM@,LLVM=@KERNEL_LLVM@) \ $(if @KERNEL_CROSS_COMPILE@,CROSS_COMPILE=@KERNEL_CROSS_COMPILE@) \ $(if @KERNEL_ARCH@,ARCH=@KERNEL_ARCH@) \ + $(if @OBJTOOL_DISABLE_WERROR@,objtool=@abs_top_builddir@/scripts/objtool-wrapper) \ M="$$PWD" @KERNEL_MAKE@ CONFIG_ZFS=m modules modules-FreeBSD: diff --git a/module/os/freebsd/spl/spl_misc.c b/module/os/freebsd/spl/spl_misc.c index 2d0821417ad9..2b27f4619660 100644 --- a/module/os/freebsd/spl/spl_misc.c +++ b/module/os/freebsd/spl/spl_misc.c @@ -100,6 +100,15 @@ spl_panic(const char *file, const char *func, int line, const char *fmt, ...) va_end(ap); } +/* + * Check if the current thread is a memory reclaim thread. + * Returns true if curproc is pageproc (FreeBSD's page daemon). + */ +int +current_is_reclaim_thread(void) +{ + return (curproc == pageproc); +} SYSINIT(opensolaris_utsname_init, SI_SUB_TUNABLES, SI_ORDER_ANY, opensolaris_utsname_init, NULL); diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c index 94f0a6b9f0be..1bb5e5d63c11 100644 --- a/module/os/freebsd/zfs/arc_os.c +++ b/module/os/freebsd/zfs/arc_os.c @@ -149,18 +149,17 @@ static eventhandler_tag arc_event_lowmem = NULL; static void arc_lowmem(void *arg __unused, int howto __unused) { - int64_t free_memory, to_free; + int64_t can_free, free_memory, to_free; arc_no_grow = B_TRUE; arc_warm = B_TRUE; arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry); + free_memory = arc_available_memory(); - int64_t can_free = arc_c - arc_c_min; - if (can_free <= 0) - return; - to_free = (can_free >> arc_shrink_shift) - MIN(free_memory, 0); + can_free = arc_c - arc_c_min; + to_free = (MAX(can_free, 0) >> arc_shrink_shift) - MIN(free_memory, 0); DTRACE_PROBE2(arc__needfree, int64_t, free_memory, 
int64_t, to_free); - arc_reduce_target_size(to_free); + to_free = arc_reduce_target_size(to_free); /* * It is unsafe to block here in arbitrary threads, because we can come @@ -168,7 +167,7 @@ arc_lowmem(void *arg __unused, int howto __unused) * with ARC reclaim thread. */ if (curproc == pageproc) { - arc_wait_for_eviction(to_free, B_FALSE); + arc_wait_for_eviction(to_free, B_FALSE, B_FALSE); ARCSTAT_BUMP(arcstat_memory_indirect_count); } else { ARCSTAT_BUMP(arcstat_memory_direct_count); diff --git a/module/os/linux/spl/spl-thread.c b/module/os/linux/spl/spl-thread.c index 7b0ce30c7884..b1c5bc744604 100644 --- a/module/os/linux/spl/spl-thread.c +++ b/module/os/linux/spl/spl-thread.c @@ -27,6 +27,7 @@ #include #include #include +#include /* * Thread interfaces @@ -196,3 +197,14 @@ issig(void) } EXPORT_SYMBOL(issig); + +/* + * Check if the current thread is a memory reclaim thread. + * Returns true if current thread is kswapd. + */ +int +current_is_reclaim_thread(void) +{ + return (current_is_kswapd()); +} +EXPORT_SYMBOL(current_is_reclaim_thread); diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c index 02583e49ca79..7e5a0860f915 100644 --- a/module/os/linux/zfs/arc_os.c +++ b/module/os/linux/zfs/arc_os.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -56,6 +57,7 @@ #include #include +#ifdef _KERNEL /* * This is a limit on how many pages the ARC shrinker makes available for * eviction in response to one page allocation attempt. Note that in @@ -70,11 +72,20 @@ * See also the comment in arc_shrinker_count(). * Set to 0 to disable limit. */ -int zfs_arc_shrinker_limit = 10000; +static int zfs_arc_shrinker_limit = 10000; + +/* + * Relative cost of ARC eviction, AKA number of seeks needed to restore evicted + * page. Bigger values make ARC more precious and evictions smaller comparing + * to other kernel subsystems. 
Value of 4 means parity with page cache, + * according to my reading of kernel's do_shrink_slab() and other code. + */ +static int zfs_arc_shrinker_seeks = DEFAULT_SEEKS; #ifdef CONFIG_MEMORY_HOTPLUG static struct notifier_block arc_hotplug_callback_mem_nb; #endif +#endif /* * Return a default max arc size based on the amount of physical memory. @@ -161,22 +172,7 @@ static unsigned long arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) { /* - * __GFP_FS won't be set if we are called from ZFS code (see - * kmem_flags_convert(), which removes it). To avoid a deadlock, we - * don't allow evicting in this case. We return 0 rather than - * SHRINK_STOP so that the shrinker logic doesn't accumulate a - * deficit against us. - */ - if (!(sc->gfp_mask & __GFP_FS)) { - return (0); - } - - /* - * This code is reached in the "direct reclaim" case, where the - * kernel (outside ZFS) is trying to allocate a page, and the system - * is low on memory. - * - * The kernel's shrinker code doesn't understand how many pages the + * The kernel's shrinker code may not understand how many pages the * ARC's callback actually frees, so it may ask the ARC to shrink a * lot for one page allocation. This is problematic because it may * take a long time, thus delaying the page allocation, and because @@ -195,40 +191,44 @@ arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) * * See also the comment above zfs_arc_shrinker_limit. */ + int64_t can_free = btop(arc_evictable_memory()); int64_t limit = zfs_arc_shrinker_limit != 0 ? 
zfs_arc_shrinker_limit : INT64_MAX; - return (MIN(limit, btop((int64_t)arc_evictable_memory()))); + return (MIN(can_free, limit)); } static unsigned long arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) { - ASSERT((sc->gfp_mask & __GFP_FS) != 0); - /* The arc is considered warm once reclaim has occurred */ if (unlikely(arc_warm == B_FALSE)) arc_warm = B_TRUE; + /* + * We are experiencing memory pressure which the arc_evict_zthr was + * unable to keep up with. Set arc_no_grow to briefly pause ARC + * growth to avoid compounding the memory pressure. + */ + arc_no_grow = B_TRUE; + /* * Evict the requested number of pages by reducing arc_c and waiting - * for the requested amount of data to be evicted. + * for the requested amount of data to be evicted. To avoid deadlock + * do not wait for eviction if we may be called from ZFS itself (see + * kmem_flags_convert() removing __GFP_FS). It may cause excessive + * eviction later if many evictions are accumulated, but just skipping + * the eviction is not good either if most of memory is used by ARC. */ - arc_reduce_target_size(ptob(sc->nr_to_scan)); - arc_wait_for_eviction(ptob(sc->nr_to_scan), B_FALSE); + uint64_t to_free = arc_reduce_target_size(ptob(sc->nr_to_scan)); + if (sc->gfp_mask & __GFP_FS) + arc_wait_for_eviction(to_free, B_FALSE, B_FALSE); if (current->reclaim_state != NULL) #ifdef HAVE_RECLAIM_STATE_RECLAIMED - current->reclaim_state->reclaimed += sc->nr_to_scan; + current->reclaim_state->reclaimed += btop(to_free); #else - current->reclaim_state->reclaimed_slab += sc->nr_to_scan; + current->reclaim_state->reclaimed_slab += btop(to_free); #endif - /* - * We are experiencing memory pressure which the arc_evict_zthr was - * unable to keep up with. Set arc_no_grow to briefly pause arc - * growth to avoid compounding the memory pressure. - */ - arc_no_grow = B_TRUE; - /* * When direct reclaim is observed it usually indicates a rapid * increase in memory pressure. 
This occurs because the kswapd @@ -241,7 +241,7 @@ arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) ARCSTAT_BUMP(arcstat_memory_direct_count); } - return (sc->nr_to_scan); + return (btop(to_free)); } static struct shrinker *arc_shrinker = NULL; @@ -295,9 +295,7 @@ arc_set_sys_free(uint64_t allmem) * arc_wait_for_eviction() will wait until at least the * high_wmark_pages() are free (see arc_evict_state_impl()). * - * Note: Even when the system is very low on memory, the kernel's - * shrinker code may only ask for one "batch" of pages (512KB) to be - * evicted. If concurrent allocations consume these pages, there may + * Note: If concurrent allocations consume these pages, there may * still be insufficient free pages, and the OOM killer takes action. * * By setting arc_sys_free large enough, and having @@ -309,20 +307,26 @@ arc_set_sys_free(uint64_t allmem) * It's hard to iterate the zones from a linux kernel module, which * makes it difficult to determine the watermark dynamically. Instead * we compute the maximum high watermark for this system, based - * on the amount of memory, assuming default parameters on Linux kernel - * 5.3. + * on the amount of memory, using the same method as the kernel uses + * to calculate its internal `min_free_kbytes` variable. See + * torvalds/linux@ee8eb9a5fe86 for the change in the upper clamp value + * from 64M to 256M. */ /* * Base wmark_low is 4 * the square root of Kbytes of RAM. */ - long wmark = 4 * int_sqrt(allmem/1024) * 1024; + long wmark = int_sqrt(allmem / 1024 * 16) * 1024; /* - * Clamp to between 128K and 64MB. + * Clamp to between 128K and 256/64MB. */ wmark = MAX(wmark, 128 * 1024); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 7, 0) + wmark = MIN(wmark, 256 * 1024 * 1024); +#else wmark = MIN(wmark, 64 * 1024 * 1024); +#endif /* * watermark_boost can increase the wmark by up to 150%. @@ -348,7 +352,7 @@ arc_lowmem_init(void) * swapping out pages when it is preferable to shrink the arc. 
*/ arc_shrinker = spl_register_shrinker("zfs-arc-shrinker", - arc_shrinker_count, arc_shrinker_scan, DEFAULT_SEEKS); + arc_shrinker_count, arc_shrinker_scan, zfs_arc_shrinker_seeks); VERIFY(arc_shrinker); arc_set_sys_free(allmem); @@ -449,3 +453,5 @@ arc_unregister_hotplug(void) ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_limit, INT, ZMOD_RW, "Limit on number of pages that ARC shrinker can reclaim at once"); +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_seeks, INT, ZMOD_RD, + "Relative cost of ARC eviction vs other kernel subsystems"); diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index 1f72cce07dd1..da0cda03985e 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -1179,6 +1179,63 @@ zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp) return (error); } +/* + * Dentry and inode caches referenced by a task in non-root memcg are + * not going to be scanned by the kernel-provided shrinker. So, if + * kernel prunes nothing, fall back to this manual walk to free dnodes. + * To avoid scanning the same znodes multiple times they are always rotated + * to the end of the z_all_znodes list. New znodes are inserted at the + * end of the list so we're always scanning the oldest znodes first. 
+ */ +static int +zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan) +{ + znode_t **zp_array, *zp; + int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *)); + int objects = 0; + int i = 0, j = 0; + + zp_array = vmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP); + + mutex_enter(&zfsvfs->z_znodes_lock); + while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) { + + if ((i++ > nr_to_scan) || (j >= max_array)) + break; + + ASSERT(list_link_active(&zp->z_link_node)); + list_remove(&zfsvfs->z_all_znodes, zp); + list_insert_tail(&zfsvfs->z_all_znodes, zp); + + /* Skip active znodes and .zfs entries */ + if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir) + continue; + + if (igrab(ZTOI(zp)) == NULL) + continue; + + zp_array[j] = zp; + j++; + } + mutex_exit(&zfsvfs->z_znodes_lock); + + for (i = 0; i < j; i++) { + zp = zp_array[i]; + + ASSERT3P(zp, !=, NULL); + d_prune_aliases(ZTOI(zp)); + + if (atomic_read(&ZTOI(zp)->i_count) == 1) + objects++; + + zrele(zp); + } + + vmem_free(zp_array, max_array * sizeof (znode_t *)); + + return (objects); +} + /* * The ARC has requested that the filesystem drop entries from the dentry * and inode caches. This can occur when the ARC needs to free meta data @@ -1222,6 +1279,14 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects) *objects = (*shrinker->scan_objects)(shrinker, &sc); #endif + /* + * Fall back to zfs_prune_aliases if kernel's shrinker did nothing + * due to dentry and inode caches being referenced by a task running + * in non-root memcg. 
+ */ + if (*objects == 0) + *objects = zfs_prune_aliases(zfsvfs, nr_to_scan); + zfs_exit(zfsvfs, FTAG); dprintf_ds(zfsvfs->z_os->os_dsl_dataset, diff --git a/module/os/linux/zfs/zpl_ctldir.c b/module/os/linux/zfs/zpl_ctldir.c index a7fdb8f28009..5d2728662fc1 100644 --- a/module/os/linux/zfs/zpl_ctldir.c +++ b/module/os/linux/zfs/zpl_ctldir.c @@ -197,7 +197,7 @@ zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags) return (!!dentry->d_inode); } -static dentry_operations_t zpl_dops_snapdirs = { +static const struct dentry_operations zpl_dops_snapdirs = { /* * Auto mounting of snapshots is only supported for 2.6.37 and * newer kernels. Prior to this kernel the ops->follow_link() @@ -210,6 +210,52 @@ static dentry_operations_t zpl_dops_snapdirs = { .d_revalidate = zpl_snapdir_revalidate, }; +/* + * For the .zfs control directory to work properly we must be able to override + * the default operations table and register custom .d_automount and + * .d_revalidate callbacks. + */ +static void +set_snapdir_dentry_ops(struct dentry *dentry, unsigned int extraflags) +{ + static const unsigned int op_flags = + DCACHE_OP_HASH | DCACHE_OP_COMPARE | + DCACHE_OP_REVALIDATE | DCACHE_OP_DELETE | + DCACHE_OP_PRUNE | DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_REAL; + +#ifdef HAVE_D_SET_D_OP + /* + * d_set_d_op() will set the DCACHE_OP_ flags according to what it + * finds in the passed dentry_operations, so we don't have to. + * + * We clear the flags and the old op table before calling d_set_d_op() + * because it issues a warning when the dentry operations table is already + * set. + */ + dentry->d_op = NULL; + dentry->d_flags &= ~op_flags; + d_set_d_op(dentry, &zpl_dops_snapdirs); + dentry->d_flags |= extraflags; +#else + /* + * Since 6.17 there's no exported way to modify dentry ops, so we have + * to reach in and do it ourselves. This should be safe for our very + * narrow use case, which is to create or splice in an entry to give + * access to a snapshot.
+ * + * We need to set the op flags directly. We hardcode + * DCACHE_OP_REVALIDATE because that's the only operation we have; if + * we ever extend zpl_dops_snapdirs we will need to update the op flags + * to match. + */ + spin_lock(&dentry->d_lock); + dentry->d_op = &zpl_dops_snapdirs; + dentry->d_flags &= ~op_flags; + dentry->d_flags |= DCACHE_OP_REVALIDATE | extraflags; + spin_unlock(&dentry->d_lock); +#endif +} + static struct dentry * zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) @@ -231,10 +277,7 @@ zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry, return (ERR_PTR(error)); ASSERT(error == 0 || ip == NULL); - d_clear_d_op(dentry); - d_set_d_op(dentry, &zpl_dops_snapdirs); - dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; - + set_snapdir_dentry_ops(dentry, DCACHE_NEED_AUTOMOUNT); return (d_splice_alias(ip, dentry)); } @@ -368,8 +411,7 @@ zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0); if (error == 0) { - d_clear_d_op(dentry); - d_set_d_op(dentry, &zpl_dops_snapdirs); + set_snapdir_dentry_ops(dentry, 0); d_instantiate(dentry, ip); } diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 7a1e7eee79de..0dd5d5068239 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -35,10 +35,7 @@ #include #include #include -#if defined(HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS) || \ - defined(HAVE_VFS_FILEMAP_DIRTY_FOLIO) -#include -#endif +#include #include #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO #include @@ -592,6 +589,7 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) return (result); } +#ifdef HAVE_VFS_WRITEPAGE /* * Write out dirty pages to the ARC, this function is only required to * support mmap(2). 
Mapped pages may be dirtied by memory operations @@ -608,6 +606,7 @@ zpl_writepage(struct page *pp, struct writeback_control *wbc) return (zpl_putpage(pp, wbc, &for_sync)); } +#endif /* * The flag combination which matches the behavior of zfs_space() is @@ -1083,7 +1082,9 @@ const struct address_space_operations zpl_address_space_operations = { #else .readpage = zpl_readpage, #endif +#ifdef HAVE_VFS_WRITEPAGE .writepage = zpl_writepage, +#endif .writepages = zpl_writepages, .direct_IO = zpl_direct_IO, #ifdef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 790babd3888e..5e99ae8a0daa 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -376,16 +376,14 @@ zvol_discard(zv_request_t *zvr) } /* - * Align the request to volume block boundaries when a secure erase is - * not required. This will prevent dnode_free_range() from zeroing out - * the unaligned parts which is slow (read-modify-write) and useless - * since we are not freeing any space by doing so. + * Align the request to volume block boundaries. This will prevent + * dnode_free_range() from zeroing out the unaligned parts which is + * slow (read-modify-write) and useless since we are not freeing any + * space by doing so. */ - if (!io_is_secure_erase(bio, rq)) { - start = P2ROUNDUP(start, zv->zv_volblocksize); - end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t); - size = end - start; - } + start = P2ROUNDUP(start, zv->zv_volblocksize); + end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t); + size = end - start; if (start >= end) goto unlock; @@ -505,6 +503,24 @@ zvol_read_task(void *arg) zv_request_task_free(task); } +/* + * Note: + * + * The kernel uses different enum names for the IO opcode, depending on the + * kernel version ('req_opf', 'req_op'). To sidestep this, use macros rather + * than inline functions for these checks. + */ +/* Should this IO go down the zvol write path? 
*/ +#define ZVOL_OP_IS_WRITE(op) \ + (op == REQ_OP_WRITE || \ + op == REQ_OP_FLUSH || \ + op == REQ_OP_DISCARD) + +/* Is this IO type supported by zvols? */ +#define ZVOL_OP_IS_SUPPORTED(op) (op == REQ_OP_READ || ZVOL_OP_IS_WRITE(op)) + +/* Get the IO opcode */ +#define ZVOL_OP(bio, rq) (bio != NULL ? bio_op(bio) : req_op(rq)) /* * Process a BIO or request @@ -522,7 +538,33 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, fstrans_cookie_t cookie = spl_fstrans_mark(); uint64_t offset = io_offset(bio, rq); uint64_t size = io_size(bio, rq); - int rw = io_data_dir(bio, rq); + int rw; + + if (unlikely(!ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq)))) { + zfs_dbgmsg("Unsupported zvol %s, op=%d, flags=0x%x", + rq != NULL ? "request" : "BIO", + ZVOL_OP(bio, rq), + rq != NULL ? rq->cmd_flags : bio->bi_opf); + ASSERT(ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq))); + zvol_end_io(bio, rq, SET_ERROR(ENOTSUPP)); + goto out; + } + + if (ZVOL_OP_IS_WRITE(ZVOL_OP(bio, rq))) { + rw = WRITE; + } else { + rw = READ; + } + + /* + * Sanity check + * + * If we're a BIO, check our rw matches the kernel's + * bio_data_dir(bio) rw. We need to check because we support fewer + * IO operations, and want to verify that what we think are reads and + * writes from those operations match what the kernel thinks. 
+ */ + ASSERT(rq != NULL || rw == bio_data_dir(bio)); if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { zvol_end_io(bio, rq, -SET_ERROR(ENXIO)); @@ -557,8 +599,8 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, #ifdef HAVE_BLK_MQ_RQ_HCTX blk_mq_hw_queue = rq->mq_hctx->queue_num; #else - blk_mq_hw_queue = - rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num; + blk_mq_hw_queue = rq->q->queue_hw_ctx[ + rq->q->mq_map[raw_smp_processor_id()]]->queue_num; #endif taskq_hash = cityhash4((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT, blk_mq_hw_queue, 0); @@ -627,7 +669,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, * interfaces lack this functionality (they block waiting for * the i/o to complete). */ - if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) { + if (io_is_discard(bio, rq)) { if (force_sync) { zvol_discard(&zvr); } else { diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 5c6e92f0f8b3..3fc5a1537ef5 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -2597,7 +2597,7 @@ arc_space_consume(uint64_t space, arc_space_type_t type) ARCSTAT_INCR(arcstat_bonus_size, space); break; case ARC_SPACE_DNODE: - ARCSTAT_INCR(arcstat_dnode_size, space); + aggsum_add(&arc_sums.arcstat_dnode_size, space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, space); @@ -2643,7 +2643,7 @@ arc_space_return(uint64_t space, arc_space_type_t type) ARCSTAT_INCR(arcstat_bonus_size, -space); break; case ARC_SPACE_DNODE: - ARCSTAT_INCR(arcstat_dnode_size, -space); + aggsum_add(&arc_sums.arcstat_dnode_size, -space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, -space); @@ -4240,6 +4240,18 @@ arc_evict_adj(uint64_t frac, uint64_t total, uint64_t up, uint64_t down, return (frac + up - down); } +/* + * Calculate (x * multiplier / divisor) without unnecessary overflows.
+ */ +static uint64_t +arc_mf(uint64_t x, uint64_t multiplier, uint64_t divisor) +{ + uint64_t q = (x / divisor); + uint64_t r = (x % divisor); + + return ((q * multiplier) + ((r * multiplier) / divisor)); +} + /* * Evict buffers from the cache, such that arcstat_size is capped by arc_c. */ @@ -4292,18 +4304,21 @@ arc_evict(void) * target is not evictable or if they go over arc_dnode_limit. */ int64_t prune = 0; - int64_t dn = wmsum_value(&arc_sums.arcstat_dnode_size); + int64_t dn = aggsum_value(&arc_sums.arcstat_dnode_size); + int64_t nem = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) + + zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]) + - zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) + - zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); w = wt * (int64_t)(arc_meta >> 16) >> 16; - if (zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) + - zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]) - - zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) - - zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]) > - w * 3 / 4) { + if (nem > w * 3 / 4) { prune = dn / sizeof (dnode_t) * zfs_arc_dnode_reduce_percent / 100; - } else if (dn > arc_dnode_limit) { - prune = (dn - arc_dnode_limit) / sizeof (dnode_t) * - zfs_arc_dnode_reduce_percent / 100; + if (nem < w && w > 4) + prune = arc_mf(prune, nem - w * 3 / 4, w / 4); + } + if (dn > arc_dnode_limit) { + prune = MAX(prune, (dn - arc_dnode_limit) / sizeof (dnode_t) * + zfs_arc_dnode_reduce_percent / 100); } if (prune > 0) arc_prune_async(prune); @@ -4404,13 +4419,14 @@ arc_flush(spa_t *spa, boolean_t retry) (void) arc_flush_state(arc_uncached, guid, ARC_BUFC_METADATA, retry); } -void -arc_reduce_target_size(int64_t to_free) +uint64_t +arc_reduce_target_size(uint64_t to_free) { - uint64_t c = arc_c; - - if (c <= arc_c_min) - return; + /* + * Get the actual arc size. 
Even if we don't need it, this updates + * the aggsum lower bound estimate for arc_is_overflowing(). + */ + uint64_t asize = aggsum_value(&arc_sums.arcstat_size); /* * All callers want the ARC to actually evict (at least) this much @@ -4420,16 +4436,28 @@ arc_reduce_target_size(int64_t to_free) * immediately have arc_c < arc_size and therefore the arc_evict_zthr * will evict. */ - uint64_t asize = aggsum_value(&arc_sums.arcstat_size); - if (asize < c) - to_free += c - asize; - arc_c = MAX((int64_t)c - to_free, (int64_t)arc_c_min); + uint64_t c = arc_c; + if (c > arc_c_min) { + c = MIN(c, MAX(asize, arc_c_min)); + to_free = MIN(to_free, c - arc_c_min); + arc_c = c - to_free; + } else { + to_free = 0; + } - /* See comment in arc_evict_cb_check() on why lock+flag */ - mutex_enter(&arc_evict_lock); - arc_evict_needed = B_TRUE; - mutex_exit(&arc_evict_lock); - zthr_wakeup(arc_evict_zthr); + /* + * Whether or not we reduced the target size, request eviction if the + * current size is over it now, since caller obviously wants some RAM. + */ + if (asize > arc_c) { + /* See comment in arc_evict_cb_check() on why lock+flag */ + mutex_enter(&arc_evict_lock); + arc_evict_needed = B_TRUE; + mutex_exit(&arc_evict_lock); + zthr_wakeup(arc_evict_zthr); + } + + return (to_free); } /* @@ -4632,9 +4660,9 @@ arc_reap_cb_check(void *arg, zthr_t *zthr) static void arc_reap_cb(void *arg, zthr_t *zthr) { - (void) arg, (void) zthr; + int64_t can_free, free_memory, to_free; - int64_t free_memory; + (void) arg, (void) zthr; fstrans_cookie_t cookie = spl_fstrans_mark(); /* @@ -4662,13 +4690,10 @@ arc_reap_cb(void *arg, zthr_t *zthr) * amount, reduce by what is needed to hit the fractional amount. 
*/ free_memory = arc_available_memory(); - - int64_t can_free = arc_c - arc_c_min; - if (can_free > 0) { - int64_t to_free = (can_free >> arc_shrink_shift) - free_memory; - if (to_free > 0) - arc_reduce_target_size(to_free); - } + can_free = arc_c - arc_c_min; + to_free = (MAX(can_free, 0) >> arc_shrink_shift) - free_memory; + if (to_free > 0) + arc_reduce_target_size(to_free); spl_fstrans_unmark(cookie); } @@ -4756,16 +4781,11 @@ arc_adapt(uint64_t bytes) } /* - * Check if arc_size has grown past our upper threshold, determined by - * zfs_arc_overflow_shift. + * Check if ARC current size has grown past our upper thresholds. */ static arc_ovf_level_t -arc_is_overflowing(boolean_t use_reserve) +arc_is_overflowing(boolean_t lax, boolean_t use_reserve) { - /* Always allow at least one block of overflow */ - int64_t overflow = MAX(SPA_MAXBLOCKSIZE, - arc_c >> zfs_arc_overflow_shift); - /* * We just compare the lower bound here for performance reasons. Our * primary goals are to make sure that the arc never grows without @@ -4775,12 +4795,24 @@ arc_is_overflowing(boolean_t use_reserve) * in the ARC. In practice, that's in the tens of MB, which is low * enough to be safe. */ - int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) - - arc_c - overflow / 2; - if (!use_reserve) - overflow /= 2; - return (over < 0 ? ARC_OVF_NONE : - over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE); + int64_t arc_over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c - + zfs_max_recordsize; + int64_t dn_over = aggsum_lower_bound(&arc_sums.arcstat_dnode_size) - + arc_dnode_limit; + + /* Always allow at least one block of overflow. */ + if (arc_over < 0 && dn_over <= 0) + return (ARC_OVF_NONE); + + /* If we are under memory pressure, report severe overflow. */ + if (!lax) + return (ARC_OVF_SEVERE); + + /* We are not under pressure, so be more or less relaxed. */ + int64_t overflow = (arc_c >> zfs_arc_overflow_shift) / 2; + if (use_reserve) + overflow *= 3; + return (arc_over < overflow ? 
ARC_OVF_SOME : ARC_OVF_SEVERE); } static abd_t * @@ -4812,15 +4844,17 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, const void *tag) /* * Wait for the specified amount of data (in bytes) to be evicted from the - * ARC, and for there to be sufficient free memory in the system. Waiting for - * eviction ensures that the memory used by the ARC decreases. Waiting for - * free memory ensures that the system won't run out of free pages, regardless - * of ARC behavior and settings. See arc_lowmem_init(). + * ARC, and for there to be sufficient free memory in the system. + * The lax argument specifies that caller does not have a specific reason + * to wait, not aware of any memory pressure. Low memory handlers though + * should set it to B_FALSE to wait for all required evictions to complete. + * The use_reserve argument allows some callers to wait less than others + * to not block critical code paths, possibly blocking other resources. */ void -arc_wait_for_eviction(uint64_t amount, boolean_t use_reserve) +arc_wait_for_eviction(uint64_t amount, boolean_t lax, boolean_t use_reserve) { - switch (arc_is_overflowing(use_reserve)) { + switch (arc_is_overflowing(lax, use_reserve)) { case ARC_OVF_NONE: return; case ARC_OVF_SOME: @@ -4915,7 +4949,7 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag, * under arc_c. See the comment above zfs_arc_eviction_pct. 
*/ arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100, - alloc_flags & ARC_HDR_USE_RESERVE); + B_TRUE, alloc_flags & ARC_HDR_USE_RESERVE); arc_buf_contents_t type = arc_buf_type(hdr); if (type == ARC_BUFC_METADATA) { @@ -6938,7 +6972,7 @@ arc_kstat_update(kstat_t *ksp, int rw) #if defined(COMPAT_FREEBSD11) as->arcstat_other_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size) + - wmsum_value(&arc_sums.arcstat_dnode_size) + + aggsum_value(&arc_sums.arcstat_dnode_size) + wmsum_value(&arc_sums.arcstat_dbuf_size); #endif @@ -6980,7 +7014,7 @@ arc_kstat_update(kstat_t *ksp, int rw) &as->arcstat_uncached_evictable_metadata); as->arcstat_dnode_size.value.ui64 = - wmsum_value(&arc_sums.arcstat_dnode_size); + aggsum_value(&arc_sums.arcstat_dnode_size); as->arcstat_bonus_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size); as->arcstat_l2_hits.value.ui64 = @@ -7349,7 +7383,7 @@ arc_state_init(void) wmsum_init(&arc_sums.arcstat_data_size, 0); wmsum_init(&arc_sums.arcstat_metadata_size, 0); wmsum_init(&arc_sums.arcstat_dbuf_size, 0); - wmsum_init(&arc_sums.arcstat_dnode_size, 0); + aggsum_init(&arc_sums.arcstat_dnode_size, 0); wmsum_init(&arc_sums.arcstat_bonus_size, 0); wmsum_init(&arc_sums.arcstat_l2_hits, 0); wmsum_init(&arc_sums.arcstat_l2_misses, 0); @@ -7507,7 +7541,7 @@ arc_state_fini(void) wmsum_fini(&arc_sums.arcstat_data_size); wmsum_fini(&arc_sums.arcstat_metadata_size); wmsum_fini(&arc_sums.arcstat_dbuf_size); - wmsum_fini(&arc_sums.arcstat_dnode_size); + aggsum_fini(&arc_sums.arcstat_dnode_size); wmsum_fini(&arc_sums.arcstat_bonus_size); wmsum_fini(&arc_sums.arcstat_l2_hits); wmsum_fini(&arc_sums.arcstat_l2_misses); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 4dcc271c9df4..263c21d412b1 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -841,8 +841,16 @@ dbuf_evict_notify(uint64_t size) * and grabbing the lock results in massive lock contention. 
*/ if (size > dbuf_cache_target_bytes()) { - if (size > dbuf_cache_hiwater_bytes()) + /* + * Avoid calling dbuf_evict_one() from memory reclaim context + * (e.g. Linux kswapd, FreeBSD pagedaemon) to prevent deadlocks. + * Memory reclaim threads can get stuck waiting for the dbuf + * hash lock. + */ + if (size > dbuf_cache_hiwater_bytes() && + !current_is_reclaim_thread()) { dbuf_evict_one(); + } cv_signal(&dbuf_evict_cv); } } diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 33f1cf948552..036c974c7ddc 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2170,7 +2170,8 @@ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) { dnode_t *dn; - int restarted = 0, err; + uint64_t txg, maxtxg = 0; + int err; restart: err = dnode_hold(os, object, FTAG, &dn); @@ -2186,19 +2187,22 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) * must be synced to disk to accurately report holes. * * Provided a RL_READER rangelock spanning 0-UINT64_MAX is - * held by the caller only a single restart will be required. + * held by the caller only limited restarts will be required. * We tolerate callers which do not hold the rangelock by - * returning EBUSY and not reporting holes after one restart. + * returning EBUSY and not reporting holes after at most + * TXG_CONCURRENT_STATES (3) restarts. 
*/ if (zfs_dmu_offset_next_sync) { rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); - if (restarted) + if (maxtxg == 0) { + txg = spa_last_synced_txg(dmu_objset_spa(os)); + maxtxg = txg + TXG_CONCURRENT_STATES; + } else if (txg >= maxtxg) return (SET_ERROR(EBUSY)); - txg_wait_synced(dmu_objset_pool(os), 0); - restarted = 1; + txg_wait_synced(dmu_objset_pool(os), ++txg); goto restart; } diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 227a2c98705a..67f8fd972ff8 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -2615,6 +2615,32 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, return (error); } +/* + * Adjust *offset to the next (or previous) block byte offset at lvl. + * Returns FALSE if *offset would overflow or underflow. + */ +static boolean_t +dnode_next_block(dnode_t *dn, int flags, uint64_t *offset, int lvl) +{ + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + int span = lvl * epbs + dn->dn_datablkshift; + uint64_t blkid, maxblkid; + + if (span >= 8 * sizeof (uint64_t)) + return (B_FALSE); + + blkid = *offset >> span; + maxblkid = 1ULL << (8 * sizeof (*offset) - span); + if (!(flags & DNODE_FIND_BACKWARDS) && blkid + 1 < maxblkid) + *offset = (blkid + 1) << span; + else if ((flags & DNODE_FIND_BACKWARDS) && blkid > 0) + *offset = (blkid << span) - 1; + else + return (B_FALSE); + + return (B_TRUE); +} + /* * Find the next hole, data, or sparse region at or after *offset. 
* The value 'blkfill' tells us how many items we expect to find @@ -2642,7 +2668,7 @@ int dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, int minlvl, uint64_t blkfill, uint64_t txg) { - uint64_t initial_offset = *offset; + uint64_t matched = *offset; int lvl, maxlvl; int error = 0; @@ -2666,16 +2692,36 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, maxlvl = dn->dn_phys->dn_nlevels; - for (lvl = minlvl; lvl <= maxlvl; lvl++) { + for (lvl = minlvl; lvl <= maxlvl; ) { error = dnode_next_offset_level(dn, flags, offset, lvl, blkfill, txg); - if (error != ESRCH) + if (error == 0 && lvl > minlvl) { + --lvl; + matched = *offset; + } else if (error == ESRCH && lvl < maxlvl && + dnode_next_block(dn, flags, &matched, lvl)) { + /* + * Continue search at next/prev offset in lvl+1 block. + * + * Usually we only search upwards at the start of the + * search as higher level blocks point at a matching + * minlvl block in most cases, but we backtrack if not. + * + * This can happen for txg > 0 searches if the block + * contains only BPs/dnodes freed at that txg. It also + * happens if we are still syncing out the tree, and + * some BP's at higher levels are not updated yet. + * + * We must adjust offset to avoid coming back to the + * same offset and getting stuck looping forever. This + * also deals with the case where offset is already at + * the beginning or end of the object. + */ + ++lvl; + *offset = matched; + } else { break; - } - - while (error == 0 && --lvl >= minlvl) { - error = dnode_next_offset_level(dn, - flags, offset, lvl, blkfill, txg); + } } /* @@ -2687,9 +2733,6 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, error = 0; } - if (error == 0 && (flags & DNODE_FIND_BACKWARDS ? 
- initial_offset < *offset : initial_offset > *offset)) - error = SET_ERROR(ESRCH); out: if (!(flags & DNODE_FIND_HAVELOCK)) rw_exit(&dn->dn_struct_rwlock); diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index a77874ea0dd3..41ab5162c94c 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -460,6 +460,8 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)); fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, spa->spa_errata); + fnvlist_add_uint64(config, ZPOOL_CONFIG_MIN_ALLOC, spa->spa_min_alloc); + fnvlist_add_uint64(config, ZPOOL_CONFIG_MAX_ALLOC, spa->spa_max_alloc); if (spa->spa_comment != NULL) fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT, spa->spa_comment); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 649fe2f634b5..d2983ba15313 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -775,6 +775,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_min_ashift = INT_MAX; spa->spa_max_ashift = 0; spa->spa_min_alloc = INT_MAX; + spa->spa_max_alloc = 0; spa->spa_gcd_alloc = INT_MAX; /* Reset cached value */ @@ -1796,6 +1797,19 @@ spa_get_worst_case_asize(spa_t *spa, uint64_t lsize) return (MAX(lsize, 1 << spa->spa_max_ashift) * spa_asize_inflation); } +/* + * Return the range of minimum allocation sizes for the normal allocation + * class. This can be used by external consumers of the DMU to estimate + * potential wasted capacity when setting the recordsize for an object. + * This is mainly for dRAID pools which always pad to a full stripe width. + */ +void +spa_get_min_alloc_range(spa_t *spa, uint64_t *min_alloc, uint64_t *max_alloc) +{ + *min_alloc = spa->spa_min_alloc; + *max_alloc = spa->spa_max_alloc; +} + /* * Return the amount of slop space in bytes. It is typically 1/32 of the pool * (3.2%), minus the embedded log space. 
On very small pools, it may be @@ -2980,6 +2994,7 @@ EXPORT_SYMBOL(spa_version); EXPORT_SYMBOL(spa_state); EXPORT_SYMBOL(spa_load_state); EXPORT_SYMBOL(spa_freeze_txg); +EXPORT_SYMBOL(spa_get_min_alloc_range); /* for Lustre */ EXPORT_SYMBOL(spa_get_dspace); EXPORT_SYMBOL(spa_update_dspace); EXPORT_SYMBOL(spa_deflate); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 82763c125caa..7d6d4606d2cf 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1428,12 +1428,14 @@ vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc) { if (min_alloc < spa->spa_min_alloc) spa->spa_min_alloc = min_alloc; - if (spa->spa_gcd_alloc == INT_MAX) { + + if (min_alloc > spa->spa_max_alloc) + spa->spa_max_alloc = min_alloc; + + if (spa->spa_gcd_alloc == INT_MAX) spa->spa_gcd_alloc = min_alloc; - } else { - spa->spa_gcd_alloc = vdev_gcd(min_alloc, - spa->spa_gcd_alloc); - } + else + spa->spa_gcd_alloc = vdev_gcd(min_alloc, spa->spa_gcd_alloc); } void @@ -1487,8 +1489,7 @@ vdev_metaslab_group_create(vdev_t *vd) if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; - uint64_t min_alloc = vdev_get_min_alloc(vd); - vdev_spa_set_alloc(spa, min_alloc); + vdev_spa_set_alloc(spa, vdev_get_min_alloc(vd)); } } } diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 5c0e750c4614..a64cd1690684 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -498,6 +498,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, vd->vdev_asize); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_MIN_ALLOC, + vdev_get_min_alloc(vd)); fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog); if (vd->vdev_noalloc) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_NONALLOCATING, diff --git a/module/zfs/zfs_chksum.c b/module/zfs/zfs_chksum.c index acedeab7a163..dd25e9e02187 100644 --- a/module/zfs/zfs_chksum.c +++ b/module/zfs/zfs_chksum.c @@ -31,9 +31,6 @@ 
#include #include -/* limit benchmarking to max 256KiB, when EdonR is slower then this: */ -#define LIMIT_PERF_MBS 300 - typedef struct { const char *name; const char *impl; @@ -51,9 +48,15 @@ typedef struct { zio_checksum_tmpl_free_t *(free); } chksum_stat_t; +#define AT_STARTUP 0 +#define AT_BENCHMARK 1 +#define AT_DONE 2 + static chksum_stat_t *chksum_stat_data = 0; -static int chksum_stat_cnt = 0; static kstat_t *chksum_kstat = NULL; +static int chksum_stat_limit = AT_STARTUP; +static int chksum_stat_cnt = 0; +static void chksum_benchmark(void); /* * Sample output on i3-1005G1 System: @@ -128,6 +131,9 @@ chksum_kstat_data(char *buf, size_t size, void *data) static void * chksum_kstat_addr(kstat_t *ksp, loff_t n) { + /* full benchmark */ + chksum_benchmark(); + if (n < chksum_stat_cnt) ksp->ks_private = (void *)(chksum_stat_data + n); else @@ -175,47 +181,36 @@ chksum_run(chksum_stat_t *cs, abd_t *abd, void *ctx, int round, kpreempt_enable(); run_bw = size * run_count * NANOSEC; - run_bw /= run_time_ns; /* B/s */ + run_bw /= run_time_ns; /* B/s */ *result = run_bw/1024/1024; /* MiB/s */ } -#define LIMIT_INIT 0 -#define LIMIT_NEEDED 1 -#define LIMIT_NOLIMIT 2 - static void chksum_benchit(chksum_stat_t *cs) { abd_t *abd; void *ctx = 0; void *salt = &cs->salt.zcs_bytes; - static int chksum_stat_limit = LIMIT_INIT; memset(salt, 0, sizeof (cs->salt.zcs_bytes)); if (cs->init) ctx = cs->init(&cs->salt); + /* benchmarks in startup mode */ + if (chksum_stat_limit == AT_STARTUP) { + abd = abd_alloc_linear(1<<18, B_FALSE); + chksum_run(cs, abd, ctx, 5, &cs->bs256k); + goto done; + } + /* allocate test memory via abd linear interface */ abd = abd_alloc_linear(1<<20, B_FALSE); + + /* benchmarks when requested */ chksum_run(cs, abd, ctx, 1, &cs->bs1k); chksum_run(cs, abd, ctx, 2, &cs->bs4k); chksum_run(cs, abd, ctx, 3, &cs->bs16k); chksum_run(cs, abd, ctx, 4, &cs->bs64k); - chksum_run(cs, abd, ctx, 5, &cs->bs256k); - - /* check if we ran on a slow cpu */ - if 
(chksum_stat_limit == LIMIT_INIT) { - if (cs->bs1k < LIMIT_PERF_MBS) { - chksum_stat_limit = LIMIT_NEEDED; - } else { - chksum_stat_limit = LIMIT_NOLIMIT; - } - } - - /* skip benchmarks >= 1MiB when the CPU is to slow */ - if (chksum_stat_limit == LIMIT_NEEDED) - goto abort; - chksum_run(cs, abd, ctx, 6, &cs->bs1m); abd_free(abd); @@ -224,7 +219,7 @@ chksum_benchit(chksum_stat_t *cs) chksum_run(cs, abd, ctx, 7, &cs->bs4m); chksum_run(cs, abd, ctx, 8, &cs->bs16m); -abort: +done: abd_free(abd); /* free up temp memory */ @@ -242,7 +237,6 @@ chksum_benchmark(void) /* we need the benchmark only for the kernel module */ return; #endif - chksum_stat_t *cs; uint64_t max; uint32_t id, cbid = 0, id_save; @@ -250,8 +244,14 @@ chksum_benchmark(void) const zfs_impl_t *sha256 = zfs_impl_get_ops("sha256"); const zfs_impl_t *sha512 = zfs_impl_get_ops("sha512"); + /* benchmarks are done */ + if (chksum_stat_limit == AT_DONE) + return; + + /* count implementations */ - chksum_stat_cnt = 2; + chksum_stat_cnt = 1; /* edonr */ + chksum_stat_cnt += 1; /* skein */ chksum_stat_cnt += sha256->getcnt(); chksum_stat_cnt += sha512->getcnt(); chksum_stat_cnt += blake3->getcnt(); @@ -331,6 +331,17 @@ chksum_benchmark(void) } } blake3->setid(id_save); + + switch (chksum_stat_limit) { + case AT_STARTUP: + /* next time we want a full benchmark */ + chksum_stat_limit = AT_BENCHMARK; + break; + case AT_BENCHMARK: + /* no further benchmarks */ + chksum_stat_limit = AT_DONE; + break; + } } void @@ -340,7 +351,7 @@ chksum_init(void) blake3_per_cpu_ctx_init(); #endif - /* Benchmark supported implementations */ + /* 256KiB benchmark */ chksum_benchmark(); /* Install kstats for all implementations */ diff --git a/scripts/.gitignore b/scripts/.gitignore index 5621a6e147a0..443cb7b8484e 100644 --- a/scripts/.gitignore +++ b/scripts/.gitignore @@ -1 +1,2 @@ common.sh +objtool-wrapper diff --git a/scripts/objtool-wrapper.in b/scripts/objtool-wrapper.in new file mode 100644 index 000000000000..0451f8718233 --- 
/dev/null +++ b/scripts/objtool-wrapper.in @@ -0,0 +1,36 @@ +#!/bin/sh + +# SPDX-License-Identifier: MIT +# +# Copyright (c) 2025 Attila Fülöp +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +# Filter out objtool's '--Werror' flag.
+ +objtool="@abs_objtool_binary@" +args=$(echo "$*" | sed s/--Werror//) + +if [ -z "$objtool" ]; then + echo "$(basename "$0"): No objtool binary configured" 1>&2 + exit 1; +fi + +# shellcheck disable=SC2086 +exec "$objtool" $args diff --git a/scripts/zfs-tests.sh b/scripts/zfs-tests.sh index 2906d73442c2..15d3a0eb9503 100755 --- a/scripts/zfs-tests.sh +++ b/scripts/zfs-tests.sh @@ -37,6 +37,7 @@ DEBUG="" CLEANUP="yes" CLEANUPALL="no" KMSG="" +TIMEOUT_DEBUG="" LOOPBACK="yes" STACK_TRACER="no" FILESIZE="4G" @@ -363,6 +364,7 @@ OPTIONS: -k Disable cleanup after test failure -K Log test names to /dev/kmsg -f Use files only, disables block device tests + -O Dump debugging info to /dev/kmsg on test timeout -S Enable stack tracer (negative performance impact) -c Only create and populate constrained path -R Automatically rerun failing tests @@ -401,7 +403,7 @@ $0 -x EOF } -while getopts 'hvqxkKfScRmn:d:Ds:r:?t:T:u:I:' OPTION; do +while getopts 'hvqxkKfScRmOn:d:Ds:r:?t:T:u:I:' OPTION; do case $OPTION in h) usage @@ -444,6 +446,9 @@ while getopts 'hvqxkKfScRmn:d:Ds:r:?t:T:u:I:' OPTION; do export NFS=1 . 
"$nfsfile" ;; + O) + TIMEOUT_DEBUG="yes" + ;; d) FILEDIR="$OPTARG" ;; @@ -766,6 +771,7 @@ msg "${TEST_RUNNER}" \ "${DEBUG:+-D}" \ "${KMEMLEAK:+-m}" \ "${KMSG:+-K}" \ + "${TIMEOUT_DEBUG:+-O}" \ "-c \"${RUNFILES}\"" \ "-T \"${TAGS}\"" \ "-i \"${STF_SUITE}\"" \ @@ -776,6 +782,7 @@ msg "${TEST_RUNNER}" \ ${DEBUG:+-D} \ ${KMEMLEAK:+-m} \ ${KMSG:+-K} \ + ${TIMEOUT_DEBUG:+-O} \ -c "${RUNFILES}" \ -T "${TAGS}" \ -i "${STF_SUITE}" \ diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index b49e1979bcbc..d25ec1e41ff1 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -372,7 +372,8 @@ tags = ['functional', 'cli_root', 'zpool'] [tests/functional/cli_root/zpool_add] tests = ['zpool_add_001_pos', 'zpool_add_002_pos', 'zpool_add_003_pos', 'zpool_add_004_pos', 'zpool_add_006_pos', 'zpool_add_007_neg', - 'zpool_add_008_neg', 'zpool_add_009_neg', 'zpool_add_010_pos', + 'zpool_add_008_neg', 'zpool_add_009_neg', 'zpool_add_warn_create', + 'zpool_add_warn_degraded', 'zpool_add_warn_removal', 'add-o_ashift', 'add_prop_ashift', 'zpool_add_dryrun_output', 'zpool_add--allow-ashift-mismatch'] tags = ['functional', 'cli_root', 'zpool_add'] diff --git a/tests/test-runner/bin/test-runner.py.in b/tests/test-runner/bin/test-runner.py.in index ea01b473b590..835b3b35f547 100755 --- a/tests/test-runner/bin/test-runner.py.in +++ b/tests/test-runner/bin/test-runner.py.in @@ -32,6 +32,7 @@ from select import select from subprocess import PIPE from subprocess import Popen from subprocess import check_output +from subprocess import run from threading import Timer from time import time, CLOCK_MONOTONIC from os.path import exists @@ -185,6 +186,63 @@ User: %s ''' % (self.pathname, self.identifier, self.outputdir, self.timeout, self.user) def kill_cmd(self, proc, options, kmemleak, keyboard_interrupt=False): + + """ + We're about to kill a command due to a timeout. 
+ If we're running with the -O option, then dump debug info about the + process with the highest CPU usage to /dev/kmsg (Linux only). This can + help debug the timeout. + + Debug info includes: + - 30 lines from 'top' + - /proc/<PID>/stack output of process with highest CPU usage + - Last lines strace-ing process with highest CPU usage + """ + if options.timeout_debug and exists("/dev/kmsg"): + c = """ +TOP_OUT="$(COLUMNS=160 top -b -n 1 | head -n 30)" +read -r PID CMD <<< $(echo "$TOP_OUT" | /usr/bin/awk \ +"/COMMAND/{ + print_next=1 + next +} +{ + if (print_next == 1) { + print \\$1\\" \\"\\$12 + exit + } +}") +echo "##### ZTS timeout debug #####" +echo "----- top -----" +echo "$TOP_OUT" +echo "----- /proc/$PID/stack ($CMD) -----" +cat /proc/$PID/stack +echo "----- strace ($CMD) -----" +TMPFILE="$(mktemp --suffix=ZTS)" +/usr/bin/strace -k --stack-traces -p $PID &> "$TMPFILE" & +sleep 0.1 +killall strace +tail -n 30 $TMPFILE +rm "$TMPFILE" +echo "##### /proc/sysrq-trigger stack #####" +""" + c = "sudo bash -c '" + c + "'" + data = run(c, capture_output=True, shell=True, text=True) + out = data.stdout + try: + kp = Popen([SUDO, "sh", "-c", + "echo '" + out + "' > /dev/kmsg"]) + kp.wait() + + """ + Trigger kernel stack traces + """ + kp = Popen([SUDO, "sh", "-c", + "echo l > /proc/sysrq-trigger"]) + kp.wait() + except Exception: + pass + """ Kill a running command due to timeout, or ^C from the keyboard. If sudo is required, this user was verified previously.
@@ -1097,6 +1155,9 @@ def parse_args(): parser.add_option('-o', action='callback', callback=options_cb, default=BASEDIR, dest='outputdir', type='string', metavar='outputdir', help='Specify an output directory.') + parser.add_option('-O', action='store_true', default=False, + dest='timeout_debug', + help='Dump debugging info to /dev/kmsg on test timeout') parser.add_option('-i', action='callback', callback=options_cb, default=TESTDIR, dest='testdir', type='string', metavar='testdir', help='Specify a test directory.') diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index f9005769cff2..bb6c9bad04b3 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1001,7 +1001,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_add/zpool_add_007_neg.ksh \ functional/cli_root/zpool_add/zpool_add_008_neg.ksh \ functional/cli_root/zpool_add/zpool_add_009_neg.ksh \ - functional/cli_root/zpool_add/zpool_add_010_pos.ksh \ + functional/cli_root/zpool_add/zpool_add_warn_create.ksh \ + functional/cli_root/zpool_add/zpool_add_warn_degraded.ksh \ + functional/cli_root/zpool_add/zpool_add_warn_removal.ksh \ functional/cli_root/zpool_add/zpool_add_dryrun_output.ksh \ functional/cli_root/zpool_attach/attach-o_ashift.ksh \ functional/cli_root/zpool_attach/cleanup.ksh \ diff --git a/tests/zfs-tests/tests/functional/acl/off/posixmode.ksh b/tests/zfs-tests/tests/functional/acl/off/posixmode.ksh index df278ae2366c..2028265e0891 100755 --- a/tests/zfs-tests/tests/functional/acl/off/posixmode.ksh +++ b/tests/zfs-tests/tests/functional/acl/off/posixmode.ksh @@ -130,7 +130,7 @@ function test_posix_mode # base } # Sanity check on tmpfs first -tmpdir=$(TMPDIR=$TEST_BASE_DIR mktemp -d) +tmpdir=$(mktemp -d) log_must mount -t tmpfs tmp $tmpdir log_must chmod 777 $tmpdir diff --git a/tests/zfs-tests/tests/functional/arc/dbufstats_001_pos.ksh b/tests/zfs-tests/tests/functional/arc/dbufstats_001_pos.ksh index 
e51cf179d8ef..ea1922f8bbe4 100755 --- a/tests/zfs-tests/tests/functional/arc/dbufstats_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/arc/dbufstats_001_pos.ksh @@ -40,8 +40,8 @@ # dbufstat and the dbufs kstat output # -DBUFSTATS_FILE=$(mktemp $TEST_BASE_DIR/dbufstats.out.XXXXXX) -DBUFS_FILE=$(mktemp $TEST_BASE_DIR/dbufs.out.XXXXXX) +DBUFSTATS_FILE=$(mktemp -t dbufstats.out.XXXXXX) +DBUFS_FILE=$(mktemp -t dbufs.out.XXXXXX) function cleanup { diff --git a/tests/zfs-tests/tests/functional/arc/dbufstats_002_pos.ksh b/tests/zfs-tests/tests/functional/arc/dbufstats_002_pos.ksh index 2908895d0c6a..3db22443bdfa 100755 --- a/tests/zfs-tests/tests/functional/arc/dbufstats_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/arc/dbufstats_002_pos.ksh @@ -42,7 +42,7 @@ # 8. Ensure that at least some dbufs moved to the mfu list in the ARC # -DBUFS_FILE=$(mktemp $TEST_BASE_DIR/dbufs.out.XXXXXX) +DBUFS_FILE=$(mktemp -t dbufs.out.XXXXXX) function cleanup { diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_007_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_007_pos.ksh index 15760398127c..3a0382867de6 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_007_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_007_pos.ksh @@ -48,8 +48,8 @@ function cleanup log_assert "Verify that 'zfs send' drills appropriate holes" log_onexit cleanup -streamfile=$(mktemp $TESTDIR/file.XXXXXX) -vdev=$(mktemp $TEST_BASE_DIR/file.XXXXXX) +streamfile=$(mktemp) +vdev=$(mktemp) function test_pool diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.kshlib b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.kshlib index cc850e3e451f..8631319c04e4 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.kshlib @@ -26,6 +26,7 @@ # # Copyright (c) 2012, 2016 by Delphix. All rights reserved. 
+# Copyright 2025 by Lawrence Livermore National Security, LLC. # . $STF_SUITE/include/libtest.shlib @@ -88,3 +89,44 @@ function save_dump_dev fi echo $dumpdev } + +function zpool_create_add_setup +{ + typeset -i i=0 + + while ((i < 10)); do + log_must truncate -s $MINVDEVSIZE $TEST_BASE_DIR/vdev$i + + eval vdev$i=$TEST_BASE_DIR/vdev$i + ((i += 1)) + done + + if is_linux; then + vdev_lo="$(losetup -f "$vdev4" --show)" + elif is_freebsd; then + vdev_lo=/dev/"$(mdconfig -a -t vnode -f "$vdev4")" + else + vdev_lo="$(lofiadm -a "$vdev4")" + fi +} + +function zpool_create_add_cleanup +{ + datasetexists $TESTPOOL1 && destroy_pool $TESTPOOL1 + + if [[ -e $vdev_lo ]]; then + if is_linux; then + log_must losetup -d "$vdev_lo" + elif is_freebsd; then + log_must mdconfig -d -u "$vdev_lo" + else + log_must lofiadm -d "$vdev_lo" + fi + fi + + typeset -i i=0 + while ((i < 10)); do + rm -f $TEST_BASE_DIR/vdev$i + ((i += 1)) + done +} diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_create.ksh similarity index 56% rename from tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh rename to tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_create.ksh index 22860e9caf1d..c1e815127330 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_create.ksh @@ -22,67 +22,45 @@ # # Copyright 2009 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. -# - -# -# Copyright (c) 2012, 2016 by Delphix. All rights reserved. +# Copyright 2012, 2016 by Delphix. All rights reserved. +# Copyright 2025 by Lawrence Livermore National Security, LLC. # . $STF_SUITE/include/libtest.shlib -. $STF_SUITE/tests/functional/cli_root/zpool_create/zpool_create.shlib +. 
$STF_SUITE/tests/functional/cli_root/zpool_add/zpool_add.kshlib # # DESCRIPTION: -# Verify zpool add succeed when adding vdevs with matching redundancy. +# Verify zpool add succeeds when adding vdevs with matching redundancy +# and warns with differing redundancy for a healthy pool. # # STRATEGY: # 1. Create several files == $MINVDEVSIZE. # 2. Verify 'zpool add' succeeds with matching redundancy. # 3. Verify 'zpool add' warns with differing redundancy. -# 4. Verify 'zpool add' warns with differing redundancy after removal. # verify_runnable "global" -function cleanup -{ - datasetexists $TESTPOOL1 && destroy_pool $TESTPOOL1 - - typeset -i i=0 - while ((i < 10)); do - rm -f $TEST_BASE_DIR/vdev$i - ((i += 1)) - done -} - +log_assert "Verify 'zpool add' warns for differing redundancy." +log_onexit zpool_create_add_cleanup -log_assert "Verify 'zpool add' succeed with keywords combination." -log_onexit cleanup +zpool_create_add_setup -# 1. Create several files == $MINVDEVSIZE. typeset -i i=0 -while ((i < 10)); do - log_must truncate -s $MINVDEVSIZE $TEST_BASE_DIR/vdev$i - - eval vdev$i=$TEST_BASE_DIR/vdev$i - ((i += 1)) -done +typeset -i j=0 set -A redundancy0_create_args \ "$vdev0" set -A redundancy1_create_args \ - "mirror $vdev0 $vdev1" \ - "raidz1 $vdev0 $vdev1" + "mirror $vdev0 $vdev1" set -A redundancy2_create_args \ - "mirror $vdev0 $vdev1 $vdev2" \ - "raidz2 $vdev0 $vdev1 $vdev2" + "mirror $vdev0 $vdev1 $vdev2" set -A redundancy3_create_args \ - "mirror $vdev0 $vdev1 $vdev2 $vdev3" \ - "raidz3 $vdev0 $vdev1 $vdev2 $vdev3" + "mirror $vdev0 $vdev1 $vdev2 $vdev3" set -A redundancy0_add_args \ "$vdev5" \ @@ -91,22 +69,13 @@ set -A redundancy0_add_args \ set -A redundancy1_add_args \ "mirror $vdev5 $vdev6" \ "raidz1 $vdev5 $vdev6" \ - "raidz1 $vdev5 $vdev6 mirror $vdev7 $vdev8" \ - "mirror $vdev5 $vdev6 raidz1 $vdev7 $vdev8" + "raidz1 $vdev5 $vdev6 mirror $vdev7 $vdev8" set -A redundancy2_add_args \ - "mirror $vdev5 $vdev6 $vdev7" \ - "raidz2 $vdev5 $vdev6 $vdev7" + 
"mirror $vdev5 $vdev6 $vdev7" set -A redundancy3_add_args \ - "mirror $vdev5 $vdev6 $vdev7 $vdev8" \ - "raidz3 $vdev5 $vdev6 $vdev7 $vdev8" - -set -A log_args "log" "$vdev4" -set -A cache_args "cache" "$vdev4" -set -A spare_args "spare" "$vdev4" - -typeset -i j=0 + "mirror $vdev5 $vdev6 $vdev7 $vdev8" function zpool_create_add { @@ -147,30 +116,6 @@ function zpool_create_forced_add done } -function zpool_create_rm_add -{ - typeset -n create_args=$1 - typeset -n add_args=$2 - typeset -n rm_args=$3 - - i=0 - while ((i < ${#create_args[@]})); do - j=0 - while ((j < ${#add_args[@]})); do - log_must zpool create $TESTPOOL1 ${create_args[$i]} - log_must zpool add $TESTPOOL1 ${rm_args[0]} ${rm_args[1]} - log_must zpool add $TESTPOOL1 ${add_args[$j]} - log_must zpool remove $TESTPOOL1 ${rm_args[1]} - log_mustnot zpool add $TESTPOOL1 ${rm_args[1]} - log_must zpool add $TESTPOOL1 ${rm_args[0]} ${rm_args[1]} - log_must zpool destroy -f $TESTPOOL1 - - ((j += 1)) - done - ((i += 1)) - done -} - # 2. Verify 'zpool add' succeeds with matching redundancy. zpool_create_add redundancy0_create_args redundancy0_add_args zpool_create_add redundancy1_create_args redundancy1_add_args @@ -194,17 +139,4 @@ zpool_create_forced_add redundancy3_create_args redundancy0_add_args zpool_create_forced_add redundancy3_create_args redundancy1_add_args zpool_create_forced_add redundancy3_create_args redundancy2_add_args -# 4. Verify 'zpool add' warns with differing redundancy after removal. 
-zpool_create_rm_add redundancy1_create_args redundancy1_add_args log_args -zpool_create_rm_add redundancy2_create_args redundancy2_add_args log_args -zpool_create_rm_add redundancy3_create_args redundancy3_add_args log_args - -zpool_create_rm_add redundancy1_create_args redundancy1_add_args cache_args -zpool_create_rm_add redundancy2_create_args redundancy2_add_args cache_args -zpool_create_rm_add redundancy3_create_args redundancy3_add_args cache_args - -zpool_create_rm_add redundancy1_create_args redundancy1_add_args spare_args -zpool_create_rm_add redundancy2_create_args redundancy2_add_args spare_args -zpool_create_rm_add redundancy3_create_args redundancy3_add_args spare_args - -log_pass "'zpool add' succeed with keywords combination." +log_pass "Verify 'zpool add' warns for differing redundancy." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_degraded.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_degraded.ksh new file mode 100755 index 000000000000..ded9cb3c68f8 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_degraded.ksh @@ -0,0 +1,195 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Copyright 2012, 2016 by Delphix. All rights reserved. +# Copyright 2025 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_add/zpool_add.kshlib + +# +# DESCRIPTION: +# Verify zpool add succeeds when adding vdevs with matching redundancy +# and warns with differing redundancy for a degraded pool. +# +# STRATEGY: +# 1. Create several files == $MINVDEVSIZE. +# 2. Verify 'zpool add' succeeds with matching redundancy +# 3. Verify 'zpool add' warns with differing redundancy when +# a. Degraded pool with replaced mismatch vdev (file vs disk) +# b. Degraded pool dRAID distributed spare active +# c. Degraded pool hot spare active +# + +verify_runnable "global" + +log_assert "Verify 'zpool add' warns for differing redundancy." 
+log_onexit zpool_create_add_cleanup + +zpool_create_add_setup + +set -A redundancy1_create_args \ + "mirror $vdev0 $vdev1" \ + "raidz1 $vdev0 $vdev1" \ + "draid1:1s $vdev0 $vdev1 $vdev9" + +set -A redundancy2_create_args \ + "mirror $vdev0 $vdev1 $vdev2" \ + "raidz2 $vdev0 $vdev1 $vdev2" \ + "draid2:1s $vdev0 $vdev1 $vdev2 $vdev9" + +set -A redundancy3_create_args \ + "mirror $vdev0 $vdev1 $vdev2 $vdev3" \ + "raidz3 $vdev0 $vdev1 $vdev2 $vdev3" \ + "draid3:1s $vdev0 $vdev1 $vdev2 $vdev3 $vdev9" + +set -A redundancy1_add_args \ + "mirror $vdev5 $vdev6" + +set -A redundancy2_add_args \ + "mirror $vdev5 $vdev6 $vdev7" + +set -A redundancy3_add_args \ + "mirror $vdev5 $vdev6 $vdev7 $vdev8" + +set -A redundancy1_create_draid_args \ + "draid1:1s $vdev0 $vdev1 $vdev2" + +set -A redundancy2_create_draid_args \ + "draid2:1s $vdev0 $vdev1 $vdev2 $vdev3" + +set -A redundancy3_create_draid_args \ + "draid3:1s $vdev0 $vdev1 $vdev2 $vdev3 $vdev9" + +set -A redundancy1_create_spare_args \ + "mirror $vdev0 $vdev1 spare $vdev_lo" \ + "raidz1 $vdev0 $vdev1 spare $vdev_lo" \ + "draid1 $vdev0 $vdev1 spare $vdev_lo" + +set -A redundancy2_create_spare_args \ + "mirror $vdev0 $vdev1 $vdev2 spare $vdev_lo" \ + "raidz2 $vdev0 $vdev1 $vdev2 spare $vdev_lo" \ + "draid2 $vdev0 $vdev1 $vdev2 spare $vdev_lo" + +set -A redundancy3_create_spare_args \ + "mirror $vdev0 $vdev1 $vdev2 $vdev3 spare $vdev_lo" \ + "raidz3 $vdev0 $vdev1 $vdev2 $vdev3 spare $vdev_lo" \ + "draid3 $vdev0 $vdev1 $vdev2 $vdev3 spare $vdev_lo" + +set -A replace_args "$vdev1" "$vdev_lo" +set -A draid1_args "$vdev1" "draid1-0-0" +set -A draid2_args "$vdev1" "draid2-0-0" +set -A draid3_args "$vdev1" "draid3-0-0" + +typeset -i i=0 +typeset -i j=0 + +function zpool_create_degraded_add +{ + typeset -n create_args=$1 + typeset -n add_args=$2 + typeset -n rm_args=$3 + + i=0 + while ((i < ${#create_args[@]})); do + j=0 + while ((j < ${#add_args[@]})); do + log_must zpool create $TESTPOOL1 ${create_args[$i]} + log_must zpool offline 
-f $TESTPOOL1 ${rm_args[0]} + log_must zpool replace -w $TESTPOOL1 ${rm_args[0]} ${rm_args[1]} + log_must zpool add $TESTPOOL1 ${add_args[$j]} + log_must zpool destroy -f $TESTPOOL1 + log_must zpool labelclear -f ${rm_args[0]} + + ((j += 1)) + done + ((i += 1)) + done +} + +function zpool_create_forced_degraded_add +{ + typeset -n create_args=$1 + typeset -n add_args=$2 + typeset -n rm_args=$3 + + i=0 + while ((i < ${#create_args[@]})); do + j=0 + while ((j < ${#add_args[@]})); do + log_must zpool create $TESTPOOL1 ${create_args[$i]} + log_must zpool offline -f $TESTPOOL1 ${rm_args[0]} + log_must zpool replace -w $TESTPOOL1 ${rm_args[0]} ${rm_args[1]} + log_mustnot zpool add $TESTPOOL1 ${add_args[$j]} + log_must zpool add --allow-replication-mismatch $TESTPOOL1 ${add_args[$j]} + log_must zpool destroy -f $TESTPOOL1 + log_must zpool labelclear -f ${rm_args[0]} + + ((j += 1)) + done + ((i += 1)) + done +} + +# 2. Verify 'zpool add' succeeds with matching redundancy and a degraded pool. +zpool_create_degraded_add redundancy1_create_args redundancy1_add_args replace_args +zpool_create_degraded_add redundancy2_create_args redundancy2_add_args replace_args +zpool_create_degraded_add redundancy3_create_args redundancy3_add_args replace_args + +# 3. Verify 'zpool add' warns with differing redundancy and a degraded pool. +# +# a. Degraded pool with replaced mismatch vdev (file vs disk) +zpool_create_forced_degraded_add redundancy1_create_args redundancy2_add_args replace_args +zpool_create_forced_degraded_add redundancy1_create_args redundancy3_add_args replace_args + +zpool_create_forced_degraded_add redundancy2_create_args redundancy1_add_args replace_args +zpool_create_forced_degraded_add redundancy2_create_args redundancy3_add_args replace_args + +zpool_create_forced_degraded_add redundancy3_create_args redundancy1_add_args replace_args +zpool_create_forced_degraded_add redundancy3_create_args redundancy2_add_args replace_args + +# b. 
Degraded pool dRAID distributed spare active + +zpool_create_forced_degraded_add redundancy1_create_draid_args redundancy2_add_args draid1_args +zpool_create_forced_degraded_add redundancy1_create_draid_args redundancy3_add_args draid1_args + +zpool_create_forced_degraded_add redundancy2_create_draid_args redundancy1_add_args draid2_args +zpool_create_forced_degraded_add redundancy2_create_draid_args redundancy3_add_args draid2_args + +zpool_create_forced_degraded_add redundancy3_create_draid_args redundancy1_add_args draid3_args +zpool_create_forced_degraded_add redundancy3_create_draid_args redundancy2_add_args draid3_args + +# c. Degraded pool hot spare active +zpool_create_forced_degraded_add redundancy1_create_spare_args redundancy2_add_args replace_args +zpool_create_forced_degraded_add redundancy1_create_spare_args redundancy3_add_args replace_args + +zpool_create_forced_degraded_add redundancy2_create_spare_args redundancy1_add_args replace_args +zpool_create_forced_degraded_add redundancy2_create_spare_args redundancy3_add_args replace_args + +zpool_create_forced_degraded_add redundancy3_create_spare_args redundancy1_add_args replace_args +zpool_create_forced_degraded_add redundancy3_create_spare_args redundancy2_add_args replace_args + +log_pass "Verify 'zpool add' warns for differing redundancy." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_removal.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_removal.ksh new file mode 100755 index 000000000000..56feaf1f9965 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_removal.ksh @@ -0,0 +1,117 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Copyright 2012, 2016 by Delphix. All rights reserved. +# Copyright 2025 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_add/zpool_add.kshlib + +# +# DESCRIPTION: +# Verify zpool add succeeds when adding vdevs with matching redundancy +# and warns with differing redundancy after removal. +# +# STRATEGY: +# 1. Create several files == $MINVDEVSIZE. +# 2. Verify 'zpool add' warns with differing redundancy after removal. +# + +verify_runnable "global" + +log_assert "Verify 'zpool add' warns for differing redundancy." 
+log_onexit zpool_create_add_cleanup + +zpool_create_add_setup + +typeset -i i=0 +typeset -i j=0 + +set -A redundancy1_create_args \ + "mirror $vdev0 $vdev1" \ + "raidz1 $vdev0 $vdev1" \ + "draid1:1s $vdev0 $vdev1 $vdev9" + +set -A redundancy2_create_args \ + "mirror $vdev0 $vdev1 $vdev2" \ + "raidz2 $vdev0 $vdev1 $vdev2" \ + "draid2:1s $vdev0 $vdev1 $vdev2 $vdev9" + +set -A redundancy3_create_args \ + "mirror $vdev0 $vdev1 $vdev2 $vdev3" \ + "raidz3 $vdev0 $vdev1 $vdev2 $vdev3" \ + "draid3:1s $vdev0 $vdev1 $vdev2 $vdev3 $vdev9" + +set -A redundancy1_add_args \ + "mirror $vdev5 $vdev6" + +set -A redundancy2_add_args \ + "mirror $vdev5 $vdev6 $vdev7" + +set -A redundancy3_add_args \ + "mirror $vdev5 $vdev6 $vdev7 $vdev8" + +set -A log_args "log" "$vdev_lo" +set -A cache_args "cache" "$vdev_lo" +set -A spare_args "spare" "$vdev_lo" + + +function zpool_create_rm_add +{ + typeset -n create_args=$1 + typeset -n add_args=$2 + typeset -n rm_args=$3 + + i=0 + while ((i < ${#create_args[@]})); do + j=0 + while ((j < ${#add_args[@]})); do + log_must zpool create $TESTPOOL1 ${create_args[$i]} + log_must zpool add $TESTPOOL1 ${rm_args[0]} ${rm_args[1]} + log_must zpool add $TESTPOOL1 ${add_args[$j]} + log_must zpool remove $TESTPOOL1 ${rm_args[1]} + log_mustnot zpool add $TESTPOOL1 ${rm_args[1]} + log_must zpool add $TESTPOOL1 ${rm_args[0]} ${rm_args[1]} + log_must zpool destroy -f $TESTPOOL1 + + ((j += 1)) + done + ((i += 1)) + done +} + +# 2. Verify 'zpool add' warns with differing redundancy after removal. 
+zpool_create_rm_add redundancy1_create_args redundancy1_add_args log_args +zpool_create_rm_add redundancy2_create_args redundancy2_add_args log_args +zpool_create_rm_add redundancy3_create_args redundancy3_add_args log_args + +zpool_create_rm_add redundancy1_create_args redundancy1_add_args cache_args +zpool_create_rm_add redundancy2_create_args redundancy2_add_args cache_args +zpool_create_rm_add redundancy3_create_args redundancy3_add_args cache_args + +zpool_create_rm_add redundancy1_create_args redundancy1_add_args spare_args +zpool_create_rm_add redundancy2_create_args redundancy2_add_args spare_args +zpool_create_rm_add redundancy3_create_args redundancy3_add_args spare_args diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_rename_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_rename_001_pos.ksh index 4d6005d2cb02..ec794782375d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_rename_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_rename_001_pos.ksh @@ -160,7 +160,7 @@ while (( i < ${#pools[*]} )); do ((i = i + 1)) done -VDEV_FILE=$(mktemp $TEST_BASE_DIR/tmp.XXXXXX) +VDEV_FILE=$(mktemp) log_must mkfile -n 128M $VDEV_FILE log_must zpool create overflow $VDEV_FILE diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_panic.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_panic.ksh index 032d1fb91a2e..c7b149087d56 100755 --- a/tests/zfs-tests/tests/functional/redacted_send/redacted_panic.ksh +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_panic.ksh @@ -28,7 +28,7 @@ typeset ds_name="panic" typeset sendfs="$POOL/$ds_name" typeset recvfs="$POOL2/$ds_name" typeset clone="$POOL/${ds_name}_clone" -typeset stream=$(mktemp $TEST_BASE_DIR/stream.XXXX) +typeset stream=$(mktemp -t stream.XXXX) function cleanup { diff --git a/tests/zfs-tests/tests/functional/snapshot/snapshot_002_pos.ksh 
b/tests/zfs-tests/tests/functional/snapshot/snapshot_002_pos.ksh index d9e44d332e05..4c78e3ab2ee9 100755 --- a/tests/zfs-tests/tests/functional/snapshot/snapshot_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/snapshot/snapshot_002_pos.ksh @@ -63,7 +63,7 @@ function cleanup log_assert "Verify an archive of a file system is identical to " \ "an archive of its snapshot." -SNAPSHOT_TARDIR="$(mktemp -d /tmp/zfstests_snapshot_002.XXXXXX)" +SNAPSHOT_TARDIR="$(mktemp -t -d zfstests_snapshot_002.XXXXXX)" log_onexit cleanup typeset -i COUNT=21 diff --git a/tests/zfs-tests/tests/functional/snapshot/snapshot_006_pos.ksh b/tests/zfs-tests/tests/functional/snapshot/snapshot_006_pos.ksh index e34a50941470..392869bf83ad 100755 --- a/tests/zfs-tests/tests/functional/snapshot/snapshot_006_pos.ksh +++ b/tests/zfs-tests/tests/functional/snapshot/snapshot_006_pos.ksh @@ -72,7 +72,7 @@ function cleanup log_assert "Verify that an archive of a dataset is identical to " \ "an archive of the dataset's snapshot." -SNAPSHOT_TARDIR="$(mktemp -d /tmp/zfstests_snapshot_006.XXXXXX)" +SNAPSHOT_TARDIR="$(mktemp -t -d zfstests_snapshot_006.XXXXXX)" log_onexit cleanup typeset -i COUNT=21 diff --git a/tests/zfs-tests/tests/functional/user_namespace/user_namespace_004.ksh b/tests/zfs-tests/tests/functional/user_namespace/user_namespace_004.ksh index e6ad25f23f93..4f6ed775ecab 100755 --- a/tests/zfs-tests/tests/functional/user_namespace/user_namespace_004.ksh +++ b/tests/zfs-tests/tests/functional/user_namespace/user_namespace_004.ksh @@ -58,7 +58,7 @@ log_onexit user_ns_cleanup log_must zfs create -o zoned=on "$TESTPOOL/userns" # 1. Try to pass a non-namespace file to zfs zone. -temp_file="$(TMPDIR=$TEST_BASE_DIR mktemp)" +temp_file="$(mktemp)" log_mustnot zfs zone "$temp_file" "$TESTPOOL/userns" # 2. Try to pass a non-namespace and non-existent file to zfs zone. 
diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh
index 9ebd5b149118..261898990b7e 100755
--- a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh
+++ b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh
@@ -47,19 +47,55 @@ if ! is_linux ; then
 	log_unsupported "Only linux supports dd with oflag=dsync for FUA writes"
 fi
 
-typeset datafile1="$(mktemp zvol_misc_fua1.XXXXXX)"
-typeset datafile2="$(mktemp zvol_misc_fua2.XXXXXX)"
+typeset datafile1="$(mktemp -t zvol_misc_fua1.XXXXXX)"
+typeset datafile2="$(mktemp -t zvol_misc_fua2.XXXXXX)"
+typeset datafile3="$(mktemp -t zvol_misc_fua3_log.XXXXXX)"
 typeset zvolpath=${ZVOL_DEVDIR}/$TESTPOOL/$TESTVOL
+typeset DISK1=${DISKS%% *}
 
 function cleanup
 {
-	rm "$datafile1" "$datafile2"
+	log_must zpool remove $TESTPOOL $datafile3
+	rm "$datafile1" "$datafile2" "$datafile3"
+}
+
+# Prints the total number of sync writes for a vdev (0 if none reported)
+# $1: vdev
+function get_sync
+{
+	zpool iostat -p -H -v -r $TESTPOOL $1 | \
+	    awk '/[0-9]+$/{s+=$4+$5} END{print s+0}'
 }
 
 function do_test {
 	# Wait for udev to create symlinks to our zvol
 	block_device_wait $zvolpath
 
+	# Write using sync (creates FLUSH calls after writes, but not FUA)
+	old_vdev_writes=$(get_sync $DISK1)
+	old_log_writes=$(get_sync $datafile3)
+
+	log_must fio --name=write_iops --size=5M \
+	    --ioengine=libaio --verify=0 --bs=4K \
+	    --iodepth=1 --rw=randwrite --group_reporting=1 \
+	    --filename=$zvolpath --sync=1
+
+	vdev_writes=$(( $(get_sync $DISK1) - $old_vdev_writes))
+	log_writes=$(( $(get_sync $datafile3) - $old_log_writes))
+
+	# When we're doing sync writes, we should see many more writes go to
+	# the log vs the first vdev. Experiments show anywhere from a 160-320x
+	# ratio of writes to the log vs the first vdev (due to some straggler
+	# writes to the first vdev).
+	#
+	# Check that we have a large ratio (100x) of sync writes going to the
+	# log device (a zero vdev count is clamped to 1 to avoid div-by-0).
+	ratio=$(($log_writes / ($vdev_writes > 0 ? $vdev_writes : 1)))
+	log_note "Got $log_writes log writes, $vdev_writes vdev writes."
+	if [ $ratio -lt 100 ] ; then
+		log_fail "Expected > 100x more log writes than vdev writes. "
+	fi
+
 	# Create a data file
 	log_must dd if=/dev/urandom of="$datafile1" bs=1M count=5
 
@@ -80,6 +116,8 @@ log_assert "Verify that a ZFS volume can do Force Unit Access (FUA)"
 log_onexit cleanup
 
 log_must zfs set compression=off $TESTPOOL/$TESTVOL
+log_must truncate -s 100M $datafile3
+log_must zpool add $TESTPOOL log $datafile3
 
 log_note "Testing without blk-mq"
 
diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh
index 47cc42b9be7d..3c2d3b849c1c 100755
--- a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh
+++ b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh
@@ -40,6 +40,7 @@
 # 5. TRIM the first 1MB and last 2MB of the 5MB block of data.
 # 6. Observe 2MB of used space on the zvol
 # 7. Verify the trimmed regions are zero'd on the zvol
+# 8. Verify Secure Erase does not work on zvols (Linux only)
 
 verify_runnable "global"
 
@@ -55,6 +56,7 @@ if is_linux ; then
 	else
 		trimcmd='blkdiscard'
 	fi
+	secure_trimcmd="$trimcmd --secure"
 else
 	# By default, FreeBSD 'trim' always does a dry-run. '-f' makes
 	# it perform the actual operation.
@@ -65,8 +67,8 @@ if ! is_physical_device $DISKS; then
 	log_unsupported "This directory cannot be run on raw files."
fi -typeset datafile1="$(mktemp zvol_misc_flags1.XXXXXX)" -typeset datafile2="$(mktemp zvol_misc_flags2.XXXXXX)" +typeset datafile1="$(mktemp -t zvol_misc_flags1.XXXXXX)" +typeset datafile2="$(mktemp -t zvol_misc_flags2.XXXXXX)" typeset zvolpath=${ZVOL_DEVDIR}/$TESTPOOL/$TESTVOL function cleanup @@ -113,6 +115,11 @@ function do_test { log_must diff $datafile1 $datafile2 log_must rm $datafile1 $datafile2 + + # Secure erase should not work (Linux check only). + if [ -n "$secure_trimcmd" ] ; then + log_mustnot $secure_trimcmd $zvolpath + fi } log_assert "Verify that a ZFS volume can be TRIMed" diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh index 3431d33d97d0..8d580911dea8 100755 --- a/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh +++ b/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh @@ -57,7 +57,7 @@ biggest_zvol_size_possible=$(largest_volsize_from_pool $TESTPOOL) typeset -f each_zvol_size=$(( floor($biggest_zvol_size_possible * 0.9 / \ $num_zvols ))) -typeset tmpdir="$(mktemp -d zvol_stress_fio_state.XXXXXX)" +typeset tmpdir="$(mktemp -t -d zvol_stress_fio_state.XXXXXX)" function create_zvols {