Skip to content

Commit

Permalink
WIP Direct IO
Browse files Browse the repository at this point in the history
Adding O_DIRECT support to ZFS to bypass the ARC for writes/reads.

O_DIRECT support in ZFS will always ensure there is coherency between
buffered and O_DIRECT IO requests. This ensures that all IO requests,
whether buffered or direct, will see the same file contents at all
times. Just as in other filesystems, O_DIRECT does not imply O_SYNC. While
data is written directly to VDEV disks, metadata will not be synced
until the associated TXG is synced.
For both O_DIRECT read and write requests, the offset and request sizes
must, at a minimum, be PAGE_SIZE aligned. In the event they are not,
then EINVAL is returned.

For O_DIRECT writes:
The request must also be block aligned (recordsize) or the write
request will take the normal (buffered) write path. In the event that
the request is block aligned and a cached copy of the buffer is in the
ARC, then it will be discarded from the ARC, forcing all further reads
to retrieve the data from disk.

For O_DIRECT reads:
The only alignment restriction is PAGE_SIZE alignment. In the event
that the requested data is buffered (in the ARC), it will just be
copied from the ARC into the user buffer.

To ensure data integrity for all data written using O_DIRECT, all user
pages are made stable in the event one of the following is required:
Checksum
Compression
Encryption
Parity
By making the user pages stable, we make sure the contents of the user
provided buffer can not be changed after any of the above operations
have taken place.

A new dataset property `direct` has been added with the following 3
allowable values:
disabled - Accepts O_DIRECT flag, but silently ignores it and treats
	   the request as a buffered IO request.
default  - Follows the alignment restrictions outlined above for
	   write/read IO requests when the O_DIRECT flag is used.
always   - Treats every write/read IO request as though it passed
	   O_DIRECT and follows the alignment restrictions outlined
	   above.

Signed-off-by: Brian Atkinson <[email protected]>
Co-authored-by: Mark Maybee <[email protected]>
Co-authored-by: Brian Atkinson <[email protected]>
  • Loading branch information
bwatkinson and Mark Maybee committed Nov 24, 2020
1 parent eb8e535 commit 7240ecf
Show file tree
Hide file tree
Showing 35 changed files with 2,006 additions and 298 deletions.
135 changes: 135 additions & 0 deletions config/kernel-get-user-pages.m4
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
dnl #
dnl # get_user_pages_unlocked() function was not available till 4.0.
dnl #
dnl # long get_user_pages_unlocked(struct task_struct *tsk,
dnl # struct mm_struct *mm, unsigned long start, unsigned long nr_pages,
dnl # int write, int force, struct page **pages)
dnl # 4.8 API Change
dnl # long get_user_pages_unlocked(unsigned long start,
dnl # unsigned long nr_pages, int write, int force, struct page **page)
dnl # 4.9 API Change
dnl # long get_user_pages_unlocked(unsigned long start, int nr_pages,
dnl # struct page **pages, unsigned int gup_flags)
dnl #
dnl #
dnl # In earlier kernels (< 4.0) get_user_pages() is available
dnl #

dnl#
dnl# Check available get_user_pages/_unlocked interfaces.
dnl#
AC_DEFUN([ZFS_AC_KERNEL_SRC_GET_USER_PAGES], [
dnl #
dnl # Registers one compile test per known call signature. Each snippet
dnl # only has to compile, so every argument is a dummy value. The test
dnl # names used here are the ones looked up again by
dnl # ZFS_AC_KERNEL_GET_USER_PAGES.
dnl #
dnl # get_user_pages_unlocked with a trailing gup_flags argument.
dnl #
ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_gup_flags], [
#include <linux/mm.h>
], [
unsigned long start = 0;
unsigned long nr_pages = 1;
unsigned int gup_flags = 0;
struct page **pages = NULL;
long ret __attribute__ ((unused));
ret = get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
])
dnl #
dnl # get_user_pages_unlocked with separate write and force arguments
dnl # and no task_struct or mm_struct.
dnl #
ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_write_flag], [
#include <linux/mm.h>
], [
unsigned long start = 0;
unsigned long nr_pages = 1;
int write = 0;
int force = 0;
long ret __attribute__ ((unused));
struct page **pages = NULL;
ret = get_user_pages_unlocked(start, nr_pages, write, force, pages);
])
dnl #
dnl # get_user_pages_unlocked with leading task_struct and mm_struct
dnl # arguments.
dnl #
ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_task_struct], [
#include <linux/mm.h>
], [
struct task_struct *tsk = NULL;
struct mm_struct *mm = NULL;
unsigned long start = 0;
unsigned long nr_pages = 1;
int write = 0;
int force = 0;
struct page **pages = NULL;
long ret __attribute__ ((unused));
ret = get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
force, pages);
])
dnl #
dnl # Plain get_user_pages with task_struct, mm_struct and a trailing
dnl # vmas argument. This is the fallback when no _unlocked variant
dnl # is usable.
dnl #
ZFS_LINUX_TEST_SRC([get_user_pages_task_struct], [
#include <linux/mm.h>
], [
struct task_struct *tsk = NULL;
struct mm_struct *mm = NULL;
struct vm_area_struct **vmas = NULL;
unsigned long start = 0;
unsigned long nr_pages = 1;
int write = 0;
int force = 0;
struct page **pages = NULL;
int ret __attribute__ ((unused));
ret = get_user_pages(tsk, mm, start, nr_pages, write,
force, pages, vmas);
])
])

dnl #
dnl # Supported get_user_pages/_unlocked interfaces checked newest to oldest.
dnl # We first check for get_user_pages_unlocked as that is available in
dnl # newer kernels.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_GET_USER_PAGES], [
dnl #
dnl # Evaluates the tests registered by ZFS_AC_KERNEL_SRC_GET_USER_PAGES
dnl # as a nested fallback chain. Each later check lives in the failure
dnl # branch of the previous one, so the first test that passes defines
dnl # its HAVE_GET_USER_PAGES_* symbol and no further check is run.
dnl #
dnl #
dnl # Current API of get_user_pages_unlocked
dnl #
AC_MSG_CHECKING([whether get_user_pages_unlocked() takes gup flags])
ZFS_LINUX_TEST_RESULT([get_user_pages_unlocked_gup_flags], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_GET_USER_PAGES_UNLOCKED_GUP_FLAGS, 1,
[get_user_pages_unlocked() takes gup flags])
], [
AC_MSG_RESULT(no)
dnl #
dnl # 4.8 API change, get_user_pages_unlocked
dnl #
AC_MSG_CHECKING([whether get_user_pages_unlocked() takes write flag])
ZFS_LINUX_TEST_RESULT([get_user_pages_unlocked_write_flag], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_GET_USER_PAGES_UNLOCKED_WRITE_FLAG, 1,
[get_user_pages_unlocked() takes write flag])
], [
AC_MSG_RESULT(no)
dnl #
dnl # 4.0 API, get_user_pages_unlocked
dnl #
AC_MSG_CHECKING(
[whether get_user_pages_unlocked() takes struct task_struct])
ZFS_LINUX_TEST_RESULT([get_user_pages_unlocked_task_struct], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_GET_USER_PAGES_UNLOCKED_TASK_STRUCT, 1,
[get_user_pages_unlocked() takes struct task_struct])
], [
AC_MSG_RESULT(no)
dnl #
dnl # Last resort, the plain get_user_pages interface
dnl #
dnl # get_user_pages
AC_MSG_CHECKING(
[whether get_user_pages() takes struct task_struct])
ZFS_LINUX_TEST_RESULT([get_user_pages_task_struct], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_GET_USER_PAGES_TASK_STRUCT, 1,
[get_user_pages() takes struct task_struct])
], [
dnl #
dnl # If we can not map the users pages in
dnl # then we can not do Direct IO
dnl #
dnl # NOTE review: unlike the earlier branches, no result is
dnl # reported for the pending check before the error macro
dnl # runs. Confirm that this is intended.
dnl #
ZFS_LINUX_TEST_ERROR([Direct IO])
])
])
])
])
])
2 changes: 2 additions & 0 deletions config/kernel.m4
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
ZFS_AC_KERNEL_SRC_VFS_GETATTR
ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS
ZFS_AC_KERNEL_SRC_VFS_ITERATE
ZFS_AC_KERNEL_SRC_GET_USER_PAGES
ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO
ZFS_AC_KERNEL_SRC_VFS_RW_ITERATE
ZFS_AC_KERNEL_SRC_VFS_GENERIC_WRITE_CHECKS
Expand Down Expand Up @@ -200,6 +201,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
ZFS_AC_KERNEL_VFS_GETATTR
ZFS_AC_KERNEL_VFS_FSYNC_2ARGS
ZFS_AC_KERNEL_VFS_ITERATE
ZFS_AC_KERNEL_GET_USER_PAGES
ZFS_AC_KERNEL_VFS_DIRECT_IO
ZFS_AC_KERNEL_VFS_RW_ITERATE
ZFS_AC_KERNEL_VFS_GENERIC_WRITE_CHECKS
Expand Down
1 change: 1 addition & 0 deletions include/os/freebsd/spl/sys/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ KERNEL_H = \
extdirent.h \
file.h \
freebsd_rwlock.h \
page.h \
inttypes.h \
isa_defs.h \
kmem_cache.h \
Expand Down
1 change: 1 addition & 0 deletions include/os/freebsd/spl/sys/mutex.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,4 +69,5 @@ typedef enum {
#define mutex_exit(lock) sx_xunlock(lock)
#define mutex_owned(lock) sx_xlocked(lock)
#define mutex_owner(lock) sx_xholder(lock)

#endif /* _OPENSOLARIS_SYS_MUTEX_H_ */
53 changes: 53 additions & 0 deletions include/os/freebsd/spl/sys/page.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/

#ifndef _SPL_PAGE_H_
#define _SPL_PAGE_H_

#include <sys/param.h>
#include <sys/uio.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Platform page handle used by the Direct IO page helpers declared below.
 * On FreeBSD it is an alias for vm_page_t.
 */
typedef vm_page_t zfs_page_p;

/*
 * Direct IO user page helpers. Only the prototypes are visible in this
 * header, so the notes below are limited to what the signatures show.
 *
 * zfs_hold_pages() and zfs_get_user_pages() both take a starting user
 * address, a page count, a read flag and an output array of page handles.
 * NOTE(review): presumably the long return value is the number of pages
 * obtained, or a negative value on error -- confirm against the
 * implementation.
 */
long zfs_hold_pages(unsigned long start, unsigned long nr_pages, int read,
zfs_page_p *pages);
long zfs_get_user_pages(unsigned long start, unsigned long nr_pages, int read,
zfs_page_p *pages);
/*
 * Counterpart of zfs_get_user_pages() for an array of nr_pages page
 * handles. NOTE(review): the read flag is boolean_t here but int in the
 * two functions above -- confirm the mismatch is intentional.
 */
void zfs_put_user_pages(zfs_page_p *pages, unsigned long nr_pages,
boolean_t read);
/*
 * Mark a single page stable / undo that marking. NOTE(review): presumably
 * "stable" means the page contents cannot be modified while the IO is in
 * flight -- confirm against the implementation.
 */
void zfs_set_page_to_stable(zfs_page_p page);
void zfs_release_stable_page(zfs_page_p page);
/*
 * Collect up to maxpages page handles for the buffers described by uio;
 * rw carries the transfer direction. NOTE(review): meaning of the int
 * return value is not visible here -- confirm against the implementation.
 */
int zfs_uio_get_user_pages(uio_t *uio, zfs_page_p *pages, unsigned maxpages,
enum uio_rw rw);

#ifdef __cplusplus
}
#endif

#endif /* _SPL_PAGE_H_ */
2 changes: 0 additions & 2 deletions include/os/freebsd/spl/sys/uio.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,6 @@
#include <sys/_uio.h>
#include <sys/debug.h>



#define uio_loffset uio_offset

typedef struct uio uio_t;
Expand Down
21 changes: 21 additions & 0 deletions include/os/freebsd/spl/sys/vm.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,15 @@ void zfs_vmobject_wunlock(vm_object_t object);
#define vm_page_grab_valid_unlocked(m, obj, idx, flags) \
vm_page_grab_valid((m), (obj), (idx), (flags))
#endif

/*
 * Wrappers around the per-page lock taken while changing a page's
 * wire/hold state. Releases older than 1300047 forward to
 * vm_page_lock()/vm_page_unlock(); from 1300047 on both wrappers
 * expand to nothing.
 */
#if __FreeBSD_version < 1300047
#define vm_page_wire_lock(pp) vm_page_lock(pp)
#define vm_page_wire_unlock(pp) vm_page_unlock(pp)
#else
#define vm_page_wire_lock(pp)
#define vm_page_wire_unlock(pp)
#endif

static inline caddr_t
zfs_map_page(vm_page_t pp, struct sf_buf **sfp)
{
Expand All @@ -70,4 +79,16 @@ zfs_unmap_page(struct sf_buf *sf)
sf_buf_free(sf);
}

/*
 * Drop the reference previously taken on pp, bracketed by the
 * vm_page_wire_lock()/vm_page_wire_unlock() wrappers.
 *
 * Releases older than 1300035 call vm_page_unhold(); newer ones call
 * vm_page_unwire() with PQ_ACTIVE as the queue argument.
 */
static inline void
page_unhold(vm_page_t pp)
{
vm_page_wire_lock(pp);
#if __FreeBSD_version < 1300035
vm_page_unhold(pp);
#else
vm_page_unwire(pp, PQ_ACTIVE);
#endif
vm_page_wire_unlock(pp);
}

#endif /* _OPENSOLARIS_SYS_VM_H_ */
1 change: 1 addition & 0 deletions include/os/linux/spl/sys/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ KERNEL_H = \
errno.h \
fcntl.h \
file.h \
page.h \
inttypes.h \
isa_defs.h \
kmem_cache.h \
Expand Down
82 changes: 82 additions & 0 deletions include/os/linux/spl/sys/page.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
/*
* Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
* Copyright (C) 2007 The Regents of the University of California.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Brian Behlendorf <[email protected]>.
* UCRL-CODE-235197
*
* This file is part of the SPL, Solaris Porting Layer.
* For details, see <http://zfsonlinux.org/>.
*
* The SPL is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation; either version 2 of the License, or (at your
* option) any later version.
*
* The SPL is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
* You should have received a copy of the GNU General Public License along
* with the SPL. If not, see <http://www.gnu.org/licenses/>.
*/

#ifndef _SPL_PAGE_H
#define _SPL_PAGE_H

#include <linux/page-flags.h>
#include <linux/pagemap.h>
#include <linux/mm.h>
#include <sys/types.h>
#include <sys/uio.h>

/*
 * zfs_get_user_pages(addr, numpages, read, pages)
 *
 * Maps numpages user pages starting at addr into the pages array, using
 * whichever get_user_pages() variant the configure checks detected. The
 * result is the return value of the underlying kernel call.
 *
 * A true read flag is translated to FOLL_WRITE (or passed as the write
 * argument on older kernels) because we are stating that the kernel will
 * have write access to the user pages: when a Direct IO read request is
 * issued, the kernel must write to the user pages.
 *
 * get_user_pages_unlocked() was not available until 4.0, so we also check
 * for get_user_pages() on older kernels.
 *
 * Every macro argument is parenthesized in the expansion so that an
 * expression argument (for example "cond ? 1 : 0" passed as read) keeps
 * its meaning and cannot be re-associated with the ?: in the expansion.
 */
/* 4.9 API change - the write and force flags are passed as gup flags */
#if defined(HAVE_GET_USER_PAGES_UNLOCKED_GUP_FLAGS)
#define zfs_get_user_pages(addr, numpages, read, pages) \
get_user_pages_unlocked((addr), (numpages), (pages), \
(read) ? FOLL_WRITE : 0)

/* 4.8 API change - no longer takes struct task_struct as argument */
#elif defined(HAVE_GET_USER_PAGES_UNLOCKED_WRITE_FLAG)
#define zfs_get_user_pages(addr, numpages, read, pages) \
get_user_pages_unlocked((addr), (numpages), (read), 0, (pages))

/* 4.0 API */
#elif defined(HAVE_GET_USER_PAGES_UNLOCKED_TASK_STRUCT)
#define zfs_get_user_pages(addr, numpages, read, pages) \
get_user_pages_unlocked(current, current->mm, (addr), (numpages), (read), \
0, (pages))

/*
 * Using get_user_pages if kernel is < 4.0
 *
 * NOTE(review): unlike the _unlocked variants, get_user_pages() on these
 * kernels is documented as requiring the caller to hold mmap_sem. This
 * macro does not take it -- confirm that every caller does.
 */
#elif defined(HAVE_GET_USER_PAGES_TASK_STRUCT)
#define zfs_get_user_pages(addr, numpages, read, pages) \
get_user_pages(current, current->mm, (addr), (numpages), (read), 0, \
(pages), NULL)
#else
/*
 * This case is unreachable. We must be able to use either
 * get_user_pages_unlocked() or get_user_pages() to map user pages into
 * the kernel.
 */
#error "Unknown Direct IO interface"
#endif

/*
 * Platform page handle used by the Direct IO page helpers declared below.
 * On Linux it is a pointer to struct page.
 */
typedef struct page *zfs_page_p;

/*
 * Counterpart of zfs_get_user_pages() for an array of nr_pages page
 * handles; read carries the direction of the request. Only the prototype
 * is visible in this header.
 */
void zfs_put_user_pages(zfs_page_p *pages, unsigned long nr_pages,
boolean_t read);
/*
 * Mark a single page stable / undo that marking. NOTE(review): presumably
 * "stable" means the page contents cannot be modified while the IO is in
 * flight -- confirm against the implementation.
 */
void zfs_set_page_to_stable(zfs_page_p page);
void zfs_release_stable_page(zfs_page_p page);
/*
 * Collect up to maxpages page handles for the buffers described by uio;
 * rw carries the transfer direction. NOTE(review): meaning of the int
 * return value is not visible here -- confirm against the implementation.
 */
int zfs_uio_get_user_pages(uio_t *uio, zfs_page_p *pages, unsigned maxpages,
enum uio_rw rw);

#endif /* _SPL_PAGE_H */
Loading

0 comments on commit 7240ecf

Please sign in to comment.