NAS-129309 / None / Copy offload support for block devices #203

Merged 6 commits on Dec 19, 2024
23 changes: 23 additions & 0 deletions Documentation/ABI/stable/sysfs-block
@@ -218,6 +218,29 @@ Description:
last zone of the device which may be smaller.


What: /sys/block/<disk>/queue/copy_max_bytes
Date: May 2024
Contact: [email protected]
Description:
[RW] This is the maximum number of bytes that the block layer
will allow for a copy request. It is always smaller than or
equal to the maximum size allowed by the hardware, as indicated
by 'copy_max_hw_bytes'. An attempt to set a value higher than
'copy_max_hw_bytes' will be truncated to 'copy_max_hw_bytes'.
Writing '0' to this file disables copy offloading for this
device; copies are then performed via emulation.


What: /sys/block/<disk>/queue/copy_max_hw_bytes
Date: May 2024
Contact: [email protected]
Description:
[RO] This is the maximum number of bytes that the hardware
will allow for a single data copy request.
A value of 0 means that the device does not support
copy offload.


What: /sys/block/<disk>/queue/crypto/
Date: February 2022
Contact: [email protected]
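As a rough illustration of the new ABI above, the sketch below reads 'copy_max_hw_bytes' and caps 'copy_max_bytes' to the hardware limit from userspace. It assumes a kernel with this series applied; the disk name "sda" and the minimal error handling are placeholders, not part of this change.

/* Hypothetical userspace sketch: query the hardware copy limit and set the
 * software limit to match. "sda" is a placeholder disk name. */
#include <stdio.h>
#include <stdlib.h>

static long read_sysfs_long(const char *path)
{
	char buf[64];
	FILE *f = fopen(path, "r");

	if (!f || !fgets(buf, sizeof(buf), f)) {
		if (f)
			fclose(f);
		return -1;
	}
	fclose(f);
	return strtol(buf, NULL, 10);
}

int main(void)
{
	long hw_bytes = read_sysfs_long("/sys/block/sda/queue/copy_max_hw_bytes");
	FILE *f;

	if (hw_bytes <= 0) {
		puts("copy offload not supported");
		return 0;
	}

	/* Values above copy_max_hw_bytes are truncated by the kernel;
	 * writing 0 falls back to emulated copies. */
	f = fopen("/sys/block/sda/queue/copy_max_bytes", "w");
	if (f) {
		fprintf(f, "%ld\n", hw_bytes);
		fclose(f);
	}
	printf("copy_max_hw_bytes=%ld\n", hw_bytes);
	return 0;
}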
7 changes: 7 additions & 0 deletions block/blk-core.c
@@ -108,6 +108,8 @@ static const char *const blk_op_name[] = {
REQ_OP_NAME(ZONE_FINISH),
REQ_OP_NAME(ZONE_APPEND),
REQ_OP_NAME(WRITE_ZEROES),
REQ_OP_NAME(COPY_SRC),
REQ_OP_NAME(COPY_DST),
REQ_OP_NAME(DRV_IN),
REQ_OP_NAME(DRV_OUT),
};
@@ -852,6 +854,11 @@ void submit_bio_noacct(struct bio *bio)
if (!bdev_is_zoned(bio->bi_bdev))
goto not_supported;
break;
case REQ_OP_COPY_SRC:
case REQ_OP_COPY_DST:
if (!q->limits.max_copy_sectors)
goto not_supported;
break;
case REQ_OP_DRV_IN:
case REQ_OP_DRV_OUT:
/*
225 changes: 225 additions & 0 deletions block/blk-lib.c
@@ -103,6 +103,231 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
}
EXPORT_SYMBOL(blkdev_issue_discard);

static inline ssize_t blkdev_copy_sanity_check(struct block_device *bdev_in,
loff_t pos_in,
struct block_device *bdev_out,
loff_t pos_out, size_t len)
{
unsigned int align = max(bdev_logical_block_size(bdev_out),
bdev_logical_block_size(bdev_in)) - 1;

if ((pos_in & align) || (pos_out & align) || (len & align) || !len ||
len >= BLK_COPY_MAX_BYTES)
return -EINVAL;

return 0;
}

static inline void blkdev_copy_endio(struct blkdev_copy_io *cio)
{
if (cio->endio) {
cio->endio(cio->private, cio->status, cio->copied);
kfree(cio);
} else {
struct task_struct *waiter = cio->waiter;

WRITE_ONCE(cio->waiter, NULL);
blk_wake_io_task(waiter);
}
}

/*
* This must only be called once all bios have been issued so that the refcount
* can only decrease. This just waits for all bios to complete.
* Returns the number of bytes copied or a negative error code
*/
static ssize_t blkdev_copy_wait_for_completion_io(struct blkdev_copy_io *cio)
{
ssize_t ret;

for (;;) {
__set_current_state(TASK_UNINTERRUPTIBLE);
if (!READ_ONCE(cio->waiter))
break;
blk_io_schedule();
}
__set_current_state(TASK_RUNNING);
ret = cio->copied;
kfree(cio);

return ret;
}

static void blkdev_copy_offload_src_endio(struct bio *bio)
{
struct blkdev_copy_offload_io *offload_io = bio->bi_private;
struct blkdev_copy_io *cio = offload_io->cio;

if (bio->bi_status) {
cio->copied = min_t(ssize_t, offload_io->offset, cio->copied);
if (!cio->status)
cio->status = blk_status_to_errno(bio->bi_status);
}
bio_put(bio);
if (offload_io->dst_bio)
bio_put(offload_io->dst_bio);

kfree(offload_io);

if (atomic_dec_and_test(&cio->refcount))
blkdev_copy_endio(cio);
}

/*
* @bdev: source block device
* @pos_in: source offset
* @pos_out: destination offset
* @len: length in bytes to be copied
* @endio: endio function to be called on completion of the copy operation,
* for synchronous operation this should be NULL
* @private: the endio function will be called with this private data,
* for synchronous operation this should be NULL
* @gfp: memory allocation flags (for bio_alloc)
* @bdev_out: destination block device
*
* For synchronous operation returns the number of bytes copied or an error.
* For asynchronous operation returns -EIOCBQUEUED or an error.
*
* Description:
* Copy data from a source offset to a destination offset within a block
* device, using the device's native copy offload feature.
* We perform the copy operation using two bios.
* 1. We take a plug and send a REQ_OP_COPY_DST bio along with the destination
* sector and length. Once this bio reaches the request layer, we form a
* request and wait for the src bio to arrive.
* 2. We issue a REQ_OP_COPY_SRC bio along with the source sector and length.
* Once this bio reaches the request layer and finds a request with the
* previously sent destination info, we merge the source bio and return.
* 3. Release the plug; the request is sent to the driver.
* This design works only for drivers with a request queue.
*/
ssize_t blkdev_copy_offload(struct block_device *bdev, loff_t pos_in,
loff_t pos_out, size_t len,
void (*endio)(void *, int, ssize_t),
void *private, gfp_t gfp, struct block_device *bdev_out)
{
struct blkdev_copy_io *cio;
struct blkdev_copy_offload_io *offload_io;
struct bio *src_bio, *dst_bio;
size_t rem, chunk;
ssize_t ret;
struct blk_plug plug;
int is_mq = 0;
size_t max_copy_bytes = min(bdev_max_copy_sectors(bdev) << SECTOR_SHIFT,
bdev_max_copy_sectors(bdev_out) << SECTOR_SHIFT);

if (!max_copy_bytes)
return -EOPNOTSUPP;

if (queue_is_mq(bdev->bd_queue)) {
if (bdev->bd_queue->mq_ops != bdev_out->bd_queue->mq_ops)
return -EOPNOTSUPP;
is_mq = 1;
} else if (!bdev->bd_disk->fops->submit_bio ||
bdev->bd_disk->fops->submit_bio != bdev_out->bd_disk->fops->submit_bio) {
return -EOPNOTSUPP;
}

/*
* Single-queue (bio-based) copy is only supported for zvols
*/
if (!is_mq && strncmp(bdev->bd_disk->disk_name, "zd", 2))
return -EOPNOTSUPP;

/*
* Cross-device copy is only supported for zvols
*/
if (bdev != bdev_out && strncmp(bdev_out->bd_disk->disk_name, "zd", 2))
return -EOPNOTSUPP;

ret = blkdev_copy_sanity_check(bdev, pos_in, bdev_out, pos_out, len);

if (ret)
return ret;

cio = kzalloc(sizeof(*cio), gfp);
if (!cio)
return -ENOMEM;
atomic_set(&cio->refcount, 1);
cio->waiter = current;
cio->endio = endio;
cio->private = private;

/*
* If there is an error, 'copied' will be set to the smallest successfully
* completed copy length
*/
cio->copied = len;
for (rem = len; rem > 0; rem -= chunk) {
chunk = min(rem, max_copy_bytes);

offload_io = kzalloc(sizeof(*offload_io), gfp);
if (!offload_io)
goto err_free_cio;
offload_io->cio = cio;
/*
* For partial completion, we use offload_io->offset to truncate the
* successfully copied length
*/
offload_io->offset = len - rem;
offload_io->driver_private = bdev_out->bd_queue->queuedata;

dst_bio = bio_alloc(bdev, 0, REQ_OP_COPY_DST, gfp);
if (!dst_bio)
goto err_free_offload_io;
dst_bio->bi_iter.bi_size = chunk;
dst_bio->bi_iter.bi_sector = pos_out >> SECTOR_SHIFT;

if (is_mq) {
blk_start_plug(&plug);
src_bio = blk_next_bio(dst_bio, bdev, 0, REQ_OP_COPY_SRC, gfp);
} else {
src_bio = bio_alloc(bdev, 0, REQ_OP_COPY_SRC, gfp);
}
if (!src_bio)
goto err_free_dst_bio;
src_bio->bi_iter.bi_size = chunk;
src_bio->bi_iter.bi_sector = pos_in >> SECTOR_SHIFT;
src_bio->bi_end_io = blkdev_copy_offload_src_endio;
src_bio->bi_private = offload_io;
offload_io->dst_bio = (is_mq) ? NULL : dst_bio;

atomic_inc(&cio->refcount);
submit_bio(src_bio);
if (is_mq)
blk_finish_plug(&plug);
pos_in += chunk;
pos_out += chunk;
}

if (atomic_dec_and_test(&cio->refcount))
blkdev_copy_endio(cio);
if (endio)
return -EIOCBQUEUED;

return blkdev_copy_wait_for_completion_io(cio);

err_free_dst_bio:
if (is_mq)
blk_finish_plug(&plug);
bio_put(dst_bio);
err_free_offload_io:
kfree(offload_io);
err_free_cio:
cio->copied = min_t(ssize_t, cio->copied, (len - rem));
cio->status = -ENOMEM;
if (rem == len) {
ret = cio->status;
kfree(cio);
return ret;
}
if (cio->endio)
return cio->status;

return blkdev_copy_wait_for_completion_io(cio);
}
EXPORT_SYMBOL_GPL(blkdev_copy_offload);

static sector_t bio_write_zeroes_limit(struct block_device *bdev)
{
sector_t bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
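For reference, here is a minimal, hypothetical sketch of a synchronous in-kernel caller of blkdev_copy_offload(); the device, offsets and 1 MiB length are made up for illustration, and passing NULL for endio and private selects the synchronous path described in the comment above.

/* Hypothetical sketch: synchronously copy 1 MiB within a single device using
 * the offload path. 'bdev' is assumed to be an already-opened block device. */
static int example_copy_one_megabyte(struct block_device *bdev)
{
	loff_t src = 0;			/* source offset in bytes */
	loff_t dst = 8 * SZ_1M;		/* destination offset in bytes */
	ssize_t copied;

	/* endio == NULL and private == NULL request synchronous completion */
	copied = blkdev_copy_offload(bdev, src, dst, SZ_1M,
				     NULL, NULL, GFP_KERNEL, bdev);
	if (copied < 0)
		return copied;	/* e.g. -EOPNOTSUPP, -EINVAL or -ENOMEM */

	/* On partial failure only 'copied' bytes reached the destination */
	return copied == SZ_1M ? 0 : -EIO;
}

For the asynchronous form, the caller would instead pass an endio callback and private pointer and treat an -EIOCBQUEUED return as the copy being in flight.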
21 changes: 21 additions & 0 deletions block/blk-merge.c
@@ -979,6 +979,9 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
if (!rq_mergeable(rq) || !bio_mergeable(bio))
return false;

if (blk_copy_offload_mergable(rq, bio))
return true;

if (req_op(rq) != bio_op(bio))
return false;

@@ -1015,6 +1018,8 @@ enum elv_merge blk_try_merge(struct request *rq, struct bio *bio)
{
if (blk_discard_mergable(rq))
return ELEVATOR_DISCARD_MERGE;
else if (blk_copy_offload_mergable(rq, bio))
return ELEVATOR_COPY_OFFLOAD_MERGE;
else if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector)
return ELEVATOR_BACK_MERGE;
else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector)
@@ -1122,6 +1127,20 @@ static enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q,
return BIO_MERGE_FAILED;
}

static enum bio_merge_status bio_attempt_copy_offload_merge(struct request *req,
struct bio *bio)
{
if (req->__data_len != bio->bi_iter.bi_size)
return BIO_MERGE_FAILED;

req->biotail->bi_next = bio;
req->biotail = bio;
req->nr_phys_segments++;
req->__data_len += bio->bi_iter.bi_size;

return BIO_MERGE_OK;
}

static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
struct request *rq,
struct bio *bio,
@@ -1142,6 +1161,8 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
break;
case ELEVATOR_DISCARD_MERGE:
return bio_attempt_discard_merge(q, rq, bio);
case ELEVATOR_COPY_OFFLOAD_MERGE:
return bio_attempt_copy_offload_merge(rq, bio);
default:
return BIO_MERGE_NONE;
}