-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adding O_DIRECT support to ZFS to bypass the ARC for writes/reads. O_DIRECT support in ZFS will always ensure there is coherency between buffered and O_DIRECT IO requests. This ensures that all IO requests, whether buffered or direct, will see the same file contents at all times. Just as in other FS's, O_DIRECT does not imply O_SYNC. While data is written directly to VDEV disks, metadata will not be synced until the associated TXG is synced. For both O_DIRECT read and write requests, the offset and request sizes must, at a minimum, be PAGE_SIZE aligned. In the event they are not, EINVAL is returned. For O_DIRECT writes: The request must also be block aligned (recordsize) or the write request will take the normal (buffered) write path. In the event that the request is block aligned and a cached copy of the buffer is in the ARC, then it will be discarded from the ARC, forcing all further reads to retrieve the data from disk. For O_DIRECT reads: The only alignment restriction is PAGE_SIZE alignment. In the event that the requested data is buffered (in the ARC), it will just be copied from the ARC into the user buffer. To ensure data integrity for all data written using O_DIRECT, all user pages are made stable in the event one of the following is required: Checksum, Compression, Encryption, Parity. By making the user pages stable, we make sure the contents of the user provided buffer can not be changed after any of the above operations have taken place. A new dataset property `direct` has been added with the following 3 allowable values: disabled - Accepts O_DIRECT flag, but silently ignores it and treats the request as a buffered IO request. default - Follows the alignment restrictions outlined above for write/read IO requests when the O_DIRECT flag is used. always - Treats every write/read IO request as though it passed O_DIRECT and follows the alignment restrictions outlined above. 
Signed-off-by: Brian Atkinson <[email protected]> Co-authored-by: Mark Maybee <[email protected]> Co-authored-by: Brian Atkinson <[email protected]>
- Loading branch information
1 parent
eb8e535
commit 7240ecf
Showing
35 changed files
with
2,006 additions
and
298 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
dnl # | ||
dnl # get_user_pages_unlocked() function was not available till 4.0. | ||
dnl # | ||
dnl # long get_user_pages_unlocked(struct task_struct *tsk, | ||
dnl # struct mm_struct *mm, unsigned long start, unsigned long nr_pages, | ||
dnl # int write, int force, struct page **pages) | ||
dnl # 4.8 API Change | ||
dnl # long get_user_pages_unlocked(unsigned long start, | ||
dnl # unsigned long nr_pages, int write, int force, struct page **page) | ||
dnl # 4.9 API Change | ||
dnl # long get_user_pages_unlocked(unsigned long start, int nr_pages, | ||
dnl # struct page **pages, unsigned int gup_flags) | ||
dnl # | ||
dnl # | ||
dnl # In earlier kernels (< 4.0) get_user_pages() is available | ||
dnl # | ||
|
||
dnl# | ||
dnl# Check available get_user_pages/_unlocked interfaces. | ||
dnl# | ||
AC_DEFUN([ZFS_AC_KERNEL_SRC_GET_USER_PAGES], [ | ||
dnl # | ||
dnl # Registers one compile test per known signature. Each test is | ||
dnl # named so that ZFS_LINUX_TEST_RESULT can look it up later. | ||
dnl # | ||
dnl # 4.9 API: get_user_pages_unlocked(start, nr_pages, pages, gup_flags) | ||
ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_gup_flags], [ | ||
#include <linux/mm.h> | ||
], [ | ||
unsigned long start = 0; | ||
unsigned long nr_pages = 1; | ||
unsigned int gup_flags = 0; | ||
struct page **pages = NULL; | ||
long ret __attribute__ ((unused)); | ||
ret = get_user_pages_unlocked(start, nr_pages, pages, gup_flags); | ||
]) | ||
dnl # 4.8 API: get_user_pages_unlocked(start, nr_pages, write, force, pages) | ||
ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_write_flag], [ | ||
#include <linux/mm.h> | ||
], [ | ||
unsigned long start = 0; | ||
unsigned long nr_pages = 1; | ||
int write = 0; | ||
int force = 0; | ||
long ret __attribute__ ((unused)); | ||
struct page **pages = NULL; | ||
ret = get_user_pages_unlocked(start, nr_pages, write, force, pages); | ||
]) | ||
dnl # 4.0 API: get_user_pages_unlocked(tsk, mm, start, nr_pages, write, | ||
dnl # force, pages) | ||
ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_task_struct], [ | ||
#include <linux/mm.h> | ||
], [ | ||
struct task_struct *tsk = NULL; | ||
struct mm_struct *mm = NULL; | ||
unsigned long start = 0; | ||
unsigned long nr_pages = 1; | ||
int write = 0; | ||
int force = 0; | ||
struct page **pages = NULL; | ||
long ret __attribute__ ((unused)); | ||
ret = get_user_pages_unlocked(tsk, mm, start, nr_pages, write, | ||
force, pages); | ||
]) | ||
dnl # < 4.0 fallback: get_user_pages(tsk, mm, start, nr_pages, write, | ||
dnl # force, pages, vmas) | ||
ZFS_LINUX_TEST_SRC([get_user_pages_task_struct], [ | ||
#include <linux/mm.h> | ||
], [ | ||
struct task_struct *tsk = NULL; | ||
struct mm_struct *mm = NULL; | ||
struct vm_area_struct **vmas = NULL; | ||
unsigned long start = 0; | ||
unsigned long nr_pages = 1; | ||
int write = 0; | ||
int force = 0; | ||
struct page **pages = NULL; | ||
int ret __attribute__ ((unused)); | ||
ret = get_user_pages(tsk, mm, start, nr_pages, write, | ||
force, pages, vmas); | ||
]) | ||
]) | ||
|
||
dnl # | ||
dnl # Supported get_user_pages/_unlocked interfaces checked newest to oldest. | ||
dnl # We first check for get_user_pages_unlocked as that is available in | ||
dnl # newer kernels. | ||
dnl # | ||
AC_DEFUN([ZFS_AC_KERNEL_GET_USER_PAGES], [ | ||
dnl # | ||
dnl # Walks the named compile tests from newest to oldest signature. | ||
dnl # Each later test is evaluated only in the "no" branch of the one | ||
dnl # before it, so AC_DEFINE runs for at most one HAVE_GET_USER_PAGES* | ||
dnl # symbol. If every test fails, ZFS_LINUX_TEST_ERROR([Direct IO]) is | ||
dnl # invoked. | ||
dnl # | ||
dnl # | ||
dnl # Current API of get_user_pages_unlocked | ||
dnl # | ||
AC_MSG_CHECKING([whether get_user_pages_unlocked() takes gup flags]) | ||
ZFS_LINUX_TEST_RESULT([get_user_pages_unlocked_gup_flags], [ | ||
AC_MSG_RESULT(yes) | ||
AC_DEFINE(HAVE_GET_USER_PAGES_UNLOCKED_GUP_FLAGS, 1, | ||
[get_user_pages_unlocked() takes gup flags]) | ||
], [ | ||
AC_MSG_RESULT(no) | ||
dnl # | ||
dnl # 4.8 API change, get_user_pages_unlocked | ||
dnl # | ||
AC_MSG_CHECKING([whether get_user_pages_unlocked() takes write flag]) | ||
ZFS_LINUX_TEST_RESULT([get_user_pages_unlocked_write_flag], [ | ||
AC_MSG_RESULT(yes) | ||
AC_DEFINE(HAVE_GET_USER_PAGES_UNLOCKED_WRITE_FLAG, 1, | ||
[get_user_pages_unlocked() takes write flag]) | ||
], [ | ||
AC_MSG_RESULT(no) | ||
dnl # | ||
dnl # 4.0 API, get_user_pages_unlocked | ||
dnl # | ||
AC_MSG_CHECKING( | ||
[whether get_user_pages_unlocked() takes struct task_struct]) | ||
ZFS_LINUX_TEST_RESULT([get_user_pages_unlocked_task_struct], [ | ||
AC_MSG_RESULT(yes) | ||
AC_DEFINE(HAVE_GET_USER_PAGES_UNLOCKED_TASK_STRUCT, 1, | ||
[get_user_pages_unlocked() takes struct task_struct]) | ||
], [ | ||
AC_MSG_RESULT(no) | ||
dnl # | ||
dnl # Last fallback: get_user_pages() taking struct task_struct | ||
dnl # | ||
AC_MSG_CHECKING( | ||
[whether get_user_pages() takes struct task_struct]) | ||
ZFS_LINUX_TEST_RESULT([get_user_pages_task_struct], [ | ||
AC_MSG_RESULT(yes) | ||
AC_DEFINE(HAVE_GET_USER_PAGES_TASK_STRUCT, 1, | ||
[get_user_pages() takes struct task_struct]) | ||
], [ | ||
dnl # | ||
dnl # If we can not map the user's pages in | ||
dnl # then we can not do Direct IO | ||
dnl # | ||
ZFS_LINUX_TEST_ERROR([Direct IO]) | ||
]) | ||
]) | ||
]) | ||
]) | ||
]) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
/* | ||
* Redistribution and use in source and binary forms, with or without | ||
* modification, are permitted provided that the following conditions | ||
* are met: | ||
* 1. Redistributions of source code must retain the above copyright | ||
* notice, this list of conditions and the following disclaimer. | ||
* 2. Redistributions in binary form must reproduce the above copyright | ||
* notice, this list of conditions and the following disclaimer in the | ||
* documentation and/or other materials provided with the distribution. | ||
* | ||
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND | ||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE | ||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
* SUCH DAMAGE. | ||
* | ||
* $FreeBSD$ | ||
*/ | ||
|
||
#ifndef _SPL_PAGE_H_ | ||
#define _SPL_PAGE_H_ | ||
|
||
#include <sys/param.h> | ||
#include <sys/uio.h> | ||
|
||
#ifdef __cplusplus | ||
extern "C" { | ||
#endif | ||
|
||
/* Page handle used by the helpers below; an alias for vm_page_t. */ | ||
typedef vm_page_t zfs_page_p; | ||
|
||
/* | ||
* Page helpers for Direct IO. Only the prototypes are visible in this | ||
* header. | ||
* NOTE(review): presumably the hold/get functions pin nr_pages user pages | ||
* starting at "start" into "pages", and zfs_put_user_pages() releases | ||
* them - confirm against the implementation. | ||
* NOTE(review): "read" is an int in zfs_hold_pages() and | ||
* zfs_get_user_pages() but a boolean_t in zfs_put_user_pages(). | ||
*/ | ||
long zfs_hold_pages(unsigned long start, unsigned long nr_pages, | ||
int read, zfs_page_p *pages); | ||
long zfs_get_user_pages(unsigned long start, unsigned long nr_pages, | ||
int read, zfs_page_p *pages); | ||
void zfs_put_user_pages(zfs_page_p *pages, | ||
unsigned long nr_pages, boolean_t read); | ||
|
||
/* Per-page stable marking and its release; semantics not shown here. */ | ||
void zfs_set_page_to_stable(zfs_page_p page); | ||
void zfs_release_stable_page(zfs_page_p page); | ||
|
||
/* Takes a uio, an output page array, a page limit and the IO direction. */ | ||
int zfs_uio_get_user_pages(uio_t *uio, zfs_page_p *pages, | ||
unsigned maxpages, enum uio_rw rw); | ||
|
||
#ifdef __cplusplus | ||
} | ||
#endif | ||
|
||
#endif /* _SPL_PAGE_H_ */ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,6 +15,7 @@ KERNEL_H = \ | |
errno.h \ | ||
fcntl.h \ | ||
file.h \ | ||
page.h \ | ||
inttypes.h \ | ||
isa_defs.h \ | ||
kmem_cache.h \ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
/* | ||
* Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. | ||
* Copyright (C) 2007 The Regents of the University of California. | ||
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). | ||
* Written by Brian Behlendorf <[email protected]>. | ||
* UCRL-CODE-235197 | ||
* | ||
* This file is part of the SPL, Solaris Porting Layer. | ||
* For details, see <http://zfsonlinux.org/>. | ||
* | ||
* The SPL is free software; you can redistribute it and/or modify it | ||
* under the terms of the GNU General Public License as published by the | ||
* Free Software Foundation; either version 2 of the License, or (at your | ||
* option) any later version. | ||
* | ||
* The SPL is distributed in the hope that it will be useful, but WITHOUT | ||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | ||
* for more details. | ||
* | ||
* You should have received a copy of the GNU General Public License along | ||
* with the SPL. If not, see <http://www.gnu.org/licenses/>. | ||
*/ | ||
|
||
#ifndef _SPL_PAGE_H | ||
#define _SPL_PAGE_H | ||
|
||
#include <linux/page-flags.h> | ||
#include <linux/pagemap.h> | ||
#include <linux/mm.h> | ||
#include <sys/types.h> | ||
#include <sys/uio.h> | ||
|
||
/* | ||
* read returning FOLL_WRITE is due to the fact that we are stating | ||
* that the kernel will have write access to the user pages. So, when | ||
* a Direct IO read request is issued, the kernel must write to the user | ||
* pages. | ||
* | ||
* get_user_pages_unlocked was not available until 4.0, so we also check | ||
* for get_user_pages on older kernels. | ||
*/ | ||
/* | ||
* zfs_get_user_pages(addr, numpages, read, pages) expands to whichever | ||
* get_user_pages*() signature configure detected. Every macro argument is | ||
* parenthesized so that a compound caller expression (for example a | ||
* conditional passed as "read") is evaluated as a single operand instead | ||
* of being re-associated with the "? FOLL_WRITE : 0" that follows it. | ||
*/ | ||
/* 4.9 API change - write and force flags are replaced by gup flags */ | ||
#if defined(HAVE_GET_USER_PAGES_UNLOCKED_GUP_FLAGS) | ||
#define zfs_get_user_pages(addr, numpages, read, pages) \ | ||
get_user_pages_unlocked((addr), (numpages), (pages), \ | ||
(read) ? FOLL_WRITE : 0) | ||
|
||
/* 4.8 API change - no longer takes struct task_struct as argument */ | ||
#elif defined(HAVE_GET_USER_PAGES_UNLOCKED_WRITE_FLAG) | ||
#define zfs_get_user_pages(addr, numpages, read, pages) \ | ||
get_user_pages_unlocked((addr), (numpages), (read), 0, (pages)) | ||
|
||
/* 4.0 API */ | ||
#elif defined(HAVE_GET_USER_PAGES_UNLOCKED_TASK_STRUCT) | ||
#define zfs_get_user_pages(addr, numpages, read, pages) \ | ||
get_user_pages_unlocked(current, current->mm, (addr), (numpages), \ | ||
(read), 0, (pages)) | ||
|
||
/* Using get_user_pages if kernel is < 4.0 */ | ||
#elif defined(HAVE_GET_USER_PAGES_TASK_STRUCT) | ||
#define zfs_get_user_pages(addr, numpages, read, pages) \ | ||
get_user_pages(current, current->mm, (addr), (numpages), (read), 0, \ | ||
(pages), NULL) | ||
#else | ||
/* | ||
* This case is unreachable. We must be able to use either | ||
* get_user_pages_unlocked() or get_user_pages() to map user pages into | ||
* the kernel. | ||
*/ | ||
#error "Unknown Direct IO interface" | ||
#endif | ||
|
||
/* Page handle used by the helpers below; an alias for struct page *. */ | ||
typedef struct page *zfs_page_p; | ||
|
||
/* | ||
* Page helpers for Direct IO. Only the prototypes are visible in this | ||
* header. | ||
* NOTE(review): presumably zfs_put_user_pages() releases nr_pages entries | ||
* of "pages" obtained through zfs_get_user_pages(), and the stable-page | ||
* pair marks/unmarks a single page - confirm against the implementation. | ||
*/ | ||
void zfs_put_user_pages(zfs_page_p *pages, | ||
unsigned long nr_pages, boolean_t read); | ||
void zfs_set_page_to_stable(zfs_page_p page); | ||
void zfs_release_stable_page(zfs_page_p page); | ||
/* Takes a uio, an output page array, a page limit and the IO direction. */ | ||
int zfs_uio_get_user_pages(uio_t *uio, zfs_page_p *pages, | ||
unsigned maxpages, enum uio_rw rw); | ||
|
||
#endif /* _SPL_PAGE_H */ |
Oops, something went wrong.