Skip to content

Commit d67eb17

Browse files
bwatkinson authored and behlendorf committed
Use pin_user_pages API for Direct I/O requests
As of kernel v5.8, the pin_user_pages* interfaces were introduced. These interfaces use the FOLL_PIN flag. This is now the preferred interface for Direct I/O requests in the kernel. The reasoning for using this new interface for Direct I/O requests is explained in the kernel documentation: Documentation/core-api/pin_user_pages.rst If pin_user_pages_unlocked is available, then all Direct I/O requests will use this new API to stay up to date with the kernel API requirements. Reviewed-by: Alexander Motin <[email protected]> Reviewed-by: Brian Behlendorf <[email protected]> Signed-off-by: Brian Atkinson <[email protected]> Closes openzfs#16856
1 parent 1862c1c commit d67eb17

File tree

4 files changed

+148
-41
lines changed

4 files changed

+148
-41
lines changed

Diff for: config/kernel-pin-user-pages.m4

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
dnl #
2+
dnl # Check for pin_user_pages_unlocked().
3+
dnl #
4+
AC_DEFUN([ZFS_AC_KERNEL_SRC_PIN_USER_PAGES], [
5+
ZFS_LINUX_TEST_SRC([pin_user_pages_unlocked], [
6+
#include <linux/mm.h>
7+
],[
8+
unsigned long start = 0;
9+
unsigned long nr_pages = 1;
10+
struct page **pages = NULL;
11+
unsigned int gup_flags = 0;
12+
long ret __attribute__ ((unused));
13+
14+
ret = pin_user_pages_unlocked(start, nr_pages, pages,
15+
gup_flags);
16+
])
17+
])
18+
19+
AC_DEFUN([ZFS_AC_KERNEL_PIN_USER_PAGES], [
20+
21+
dnl #
22+
dnl # Kernel 5.8 introduced the pin_user_pages* interfaces which should
23+
dnl # be used for Direct I/O requests.
24+
dnl #
25+
AC_MSG_CHECKING([whether pin_user_pages_unlocked() is available])
26+
ZFS_LINUX_TEST_RESULT([pin_user_pages_unlocked], [
27+
AC_MSG_RESULT(yes)
28+
AC_DEFINE(HAVE_PIN_USER_PAGES_UNLOCKED, 1,
29+
[pin_user_pages_unlocked() is available])
30+
],[
31+
AC_MSG_RESULT(no)
32+
])
33+
])

Diff for: config/kernel-vfs-iov_iter.m4

+19-24
Original file line numberDiff line numberDiff line change
@@ -13,26 +13,21 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_IOV_ITER], [
1313
error = fault_in_iov_iter_readable(&iter, size);
1414
])
1515
16-
ZFS_LINUX_TEST_SRC([iov_iter_get_pages2], [
16+
ZFS_LINUX_TEST_SRC([iov_iter_type], [
17+
#include <linux/fs.h>
1718
#include <linux/uio.h>
1819
],[
1920
struct iov_iter iter = { 0 };
20-
struct page **pages = NULL;
21-
size_t maxsize = 4096;
22-
unsigned maxpages = 1;
23-
size_t start;
24-
size_t ret __attribute__ ((unused));
25-
26-
ret = iov_iter_get_pages2(&iter, pages, maxsize, maxpages,
27-
&start);
21+
__attribute__((unused)) enum iter_type i = iov_iter_type(&iter);
2822
])
2923
30-
ZFS_LINUX_TEST_SRC([iov_iter_type], [
31-
#include <linux/fs.h>
24+
ZFS_LINUX_TEST_SRC([iter_is_ubuf], [
3225
#include <linux/uio.h>
3326
],[
3427
struct iov_iter iter = { 0 };
35-
__attribute__((unused)) enum iter_type i = iov_iter_type(&iter);
28+
bool ret __attribute__((unused));
29+
30+
ret = iter_is_ubuf(&iter);
3631
])
3732
3833
ZFS_LINUX_TEST_SRC([iter_iov], [
@@ -55,18 +50,6 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [
5550
AC_MSG_RESULT(no)
5651
])
5752
58-
dnl #
59-
dnl # Kernel 6.0 changed iov_iter_get_pages() to iov_iter_get_pages2().
60-
dnl #
61-
AC_MSG_CHECKING([whether iov_iter_get_pages2() is available])
62-
ZFS_LINUX_TEST_RESULT([iov_iter_get_pages2], [
63-
AC_MSG_RESULT(yes)
64-
AC_DEFINE(HAVE_IOV_ITER_GET_PAGES2, 1,
65-
[iov_iter_get_pages2() is available])
66-
],[
67-
AC_MSG_RESULT(no)
68-
])
69-
7053
dnl #
7154
dnl # This checks for iov_iter_type() in linux/uio.h. It is not
7255
dnl # required, however, and the module will be compiled without it
@@ -81,6 +64,18 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [
8164
AC_MSG_RESULT(no)
8265
])
8366
67+
dnl #
68+
dnl # Kernel 6.0 introduced the ITER_UBUF iov_iter type. iter_is_ubuf()
69+
dnl # was also added to determine if the iov_iter is an ITER_UBUF.
70+
dnl #
71+
AC_MSG_CHECKING([whether iter_is_ubuf() is available])
72+
ZFS_LINUX_TEST_RESULT([iter_is_ubuf], [
73+
AC_MSG_RESULT(yes)
74+
AC_DEFINE(HAVE_ITER_IS_UBUF, 1, [iter_is_ubuf() is available])
75+
],[
76+
AC_MSG_RESULT(no)
77+
])
78+
8479
dnl #
8580
dnl # Kernel 6.5 introduces the iter_iov() function that returns the
8681
dnl # __iov member of an iov_iter*. The iov member was renamed to this

Diff for: config/kernel.m4

+2
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
127127
ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE
128128
ZFS_AC_KERNEL_SRC_MM_PAGE_MAPPING
129129
ZFS_AC_KERNEL_SRC_FILE
130+
ZFS_AC_KERNEL_SRC_PIN_USER_PAGES
130131
case "$host_cpu" in
131132
powerpc*)
132133
ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
@@ -238,6 +239,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
238239
ZFS_AC_KERNEL_MM_PAGE_MAPPING
239240
ZFS_AC_KERNEL_1ARG_ASSIGN_STR
240241
ZFS_AC_KERNEL_FILE
242+
ZFS_AC_KERNEL_PIN_USER_PAGES
241243
case "$host_cpu" in
242244
powerpc*)
243245
ZFS_AC_KERNEL_CPU_HAS_FEATURE

Diff for: module/os/linux/zfs/zfs_uio.c

+94-17
Original file line numberDiff line numberDiff line change
@@ -441,6 +441,7 @@ zfs_unmark_page(struct page *page)
441441
}
442442
#endif /* HAVE_ZERO_PAGE_GPL_ONLY || !_LP64 */
443443

444+
#if !defined(HAVE_PIN_USER_PAGES_UNLOCKED)
444445
static void
445446
zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio)
446447
{
@@ -472,6 +473,7 @@ zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio)
472473
}
473474
}
474475
}
476+
#endif
475477

476478
void
477479
zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
@@ -480,6 +482,9 @@ zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
480482
ASSERT(uio->uio_extflg & UIO_DIRECT);
481483
ASSERT3P(uio->uio_dio.pages, !=, NULL);
482484

485+
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
486+
unpin_user_pages(uio->uio_dio.pages, uio->uio_dio.npages);
487+
#else
483488
for (long i = 0; i < uio->uio_dio.npages; i++) {
484489
struct page *p = uio->uio_dio.pages[i];
485490

@@ -491,51 +496,114 @@ zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
491496

492497
put_page(p);
493498
}
494-
499+
#endif
495500
vmem_free(uio->uio_dio.pages,
496501
uio->uio_dio.npages * sizeof (struct page *));
497502
}
498503

504+
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
499505
static int
500-
zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw)
506+
zfs_uio_pin_user_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
501507
{
508+
long res;
502509
size_t skip = uio->uio_skip;
510+
size_t len = uio->uio_resid - skip;
511+
unsigned int gup_flags = 0;
512+
unsigned long addr;
513+
unsigned long nr_pages;
514+
515+
/*
516+
* Kernel 6.2 introduced the FOLL_PCI_P2PDMA flag. This flag could
517+
* possibly be used here in the future to allow for P2P operations with
518+
* user pages.
519+
*/
520+
if (rw == UIO_READ)
521+
gup_flags = FOLL_WRITE;
522+
523+
if (len == 0)
524+
return (0);
525+
526+
#if defined(HAVE_ITER_IS_UBUF)
527+
if (iter_is_ubuf(uio->uio_iter)) {
528+
nr_pages = DIV_ROUND_UP(len, PAGE_SIZE);
529+
addr = (unsigned long)uio->uio_iter->ubuf + skip;
530+
res = pin_user_pages_unlocked(addr, nr_pages,
531+
&uio->uio_dio.pages[uio->uio_dio.npages], gup_flags);
532+
if (res < 0) {
533+
return (SET_ERROR(-res));
534+
} else if (len != (res * PAGE_SIZE)) {
535+
uio->uio_dio.npages += res;
536+
return (SET_ERROR(EFAULT));
537+
}
538+
uio->uio_dio.npages += res;
539+
return (0);
540+
}
541+
#endif
542+
const struct iovec *iovp = zfs_uio_iter_iov(uio->uio_iter);
543+
for (int i = 0; i < uio->uio_iovcnt; i++) {
544+
size_t amt = iovp->iov_len - skip;
545+
if (amt == 0) {
546+
iovp++;
547+
skip = 0;
548+
continue;
549+
}
550+
551+
addr = (unsigned long)iovp->iov_base + skip;
552+
nr_pages = DIV_ROUND_UP(amt, PAGE_SIZE);
553+
res = pin_user_pages_unlocked(addr, nr_pages,
554+
&uio->uio_dio.pages[uio->uio_dio.npages], gup_flags);
555+
if (res < 0) {
556+
return (SET_ERROR(-res));
557+
} else if (amt != (res * PAGE_SIZE)) {
558+
uio->uio_dio.npages += res;
559+
return (SET_ERROR(EFAULT));
560+
}
561+
562+
len -= amt;
563+
uio->uio_dio.npages += res;
564+
skip = 0;
565+
iovp++;
566+
};
567+
568+
ASSERT0(len);
569+
570+
return (0);
571+
}
572+
573+
#else
574+
static int
575+
zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw)
576+
{
577+
size_t start;
503578
size_t wanted = uio->uio_resid - uio->uio_skip;
504579
ssize_t rollback = 0;
505580
ssize_t cnt;
506581
unsigned maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE);
507582

508583
while (wanted) {
509-
#if defined(HAVE_IOV_ITER_GET_PAGES2)
510-
cnt = iov_iter_get_pages2(uio->uio_iter,
511-
&uio->uio_dio.pages[uio->uio_dio.npages],
512-
wanted, maxpages, &skip);
513-
#else
514584
cnt = iov_iter_get_pages(uio->uio_iter,
515585
&uio->uio_dio.pages[uio->uio_dio.npages],
516-
wanted, maxpages, &skip);
517-
#endif
586+
wanted, maxpages, &start);
518587
if (cnt < 0) {
519588
iov_iter_revert(uio->uio_iter, rollback);
520589
return (SET_ERROR(-cnt));
521590
}
591+
/*
592+
* All Direct I/O operations must be page aligned.
593+
*/
594+
ASSERT(IS_P2ALIGNED(start, PAGE_SIZE));
522595
uio->uio_dio.npages += DIV_ROUND_UP(cnt, PAGE_SIZE);
523596
rollback += cnt;
524597
wanted -= cnt;
525-
skip = 0;
526-
#if !defined(HAVE_IOV_ITER_GET_PAGES2)
527-
/*
528-
* iov_iter_get_pages2() advances the iov_iter on success.
529-
*/
530598
iov_iter_advance(uio->uio_iter, cnt);
531-
#endif
532599

533600
}
534601
ASSERT3U(rollback, ==, uio->uio_resid - uio->uio_skip);
535602
iov_iter_revert(uio->uio_iter, rollback);
536603

537604
return (0);
538605
}
606+
#endif /* HAVE_PIN_USER_PAGES_UNLOCKED */
539607

540608
/*
541609
* This function pins user pages. In the event that the user pages were not
@@ -552,25 +620,34 @@ zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw)
552620

553621
if (uio->uio_segflg == UIO_ITER) {
554622
uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP);
623+
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
624+
error = zfs_uio_pin_user_pages(uio, rw);
625+
#else
555626
error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
627+
#endif
556628
} else {
557629
return (SET_ERROR(EOPNOTSUPP));
558630
}
559631

560632
ASSERT3S(uio->uio_dio.npages, >=, 0);
561633

562634
if (error) {
635+
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
636+
unpin_user_pages(uio->uio_dio.pages, uio->uio_dio.npages);
637+
#else
563638
for (long i = 0; i < uio->uio_dio.npages; i++)
564639
put_page(uio->uio_dio.pages[i]);
640+
#endif
565641
vmem_free(uio->uio_dio.pages, size);
566642
return (error);
567643
} else {
568644
ASSERT3S(uio->uio_dio.npages, ==, npages);
569645
}
570646

571-
if (rw == UIO_WRITE) {
647+
#if !defined(HAVE_PIN_USER_PAGES_UNLOCKED)
648+
if (rw == UIO_WRITE)
572649
zfs_uio_dio_check_for_zero_page(uio);
573-
}
650+
#endif
574651

575652
uio->uio_extflg |= UIO_DIRECT;
576653

0 commit comments

Comments
 (0)