| // SPDX-License-Identifier: GPL-2.0-or-later |
| /* Direct I/O support. |
| * |
| * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. |
| * Written by David Howells (dhowells@redhat.com) |
| */ |
| |
| #include <linux/export.h> |
| #include <linux/fs.h> |
| #include <linux/mm.h> |
| #include <linux/pagemap.h> |
| #include <linux/slab.h> |
| #include <linux/uio.h> |
| #include <linux/sched/mm.h> |
| #include <linux/task_io_accounting_ops.h> |
| #include <linux/netfs.h> |
| #include "internal.h" |
| |
| /* |
| * If we did a direct read to a bounce buffer (say we needed to decrypt it), |
| * copy the data obtained to the destination iterator. |
| */ |
| int netfs_dio_copy_bounce_to_dest(struct netfs_io_request *rreq, struct kiocb *iocb) |
| { |
| struct iov_iter *dest_iter = &rreq->buffer.iter; |
| unsigned long long start = rreq->start; |
| |
| _enter("%zx/%llx @%llx", rreq->transferred, rreq->len, start); |
| |
| if (!iocb) |
| iocb = rreq->iocb; |
| |
| if (!test_bit(NETFS_RREQ_USE_BOUNCE_BUFFER, &rreq->flags)) |
| return 0; |
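	/* If we weren't decrypting in place in the bounce buffer, the crypto
	 * copied the data to the destination for us; just check that some
	 * data actually arrived.
	 */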
| if (!test_bit(NETFS_RREQ_CRYPT_IN_PLACE, &rreq->flags)) { |
| if (!rreq->transferred) { |
| trace_netfs_failure(rreq, NULL, -EIO, netfs_fail_dio_read_zero); |
| return -EIO; |
| } |
| return 0; |
| } |
| |
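	/* If the read was expanded downwards to a crypto block boundary, skip
	 * the pad at the front and return only the data that was asked for.
	 */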
| if (start < iocb->ki_pos) { |
| if (rreq->transferred <= iocb->ki_pos - start) { |
| trace_netfs_failure(rreq, NULL, -EIO, netfs_fail_dio_read_short); |
| return -EIO; |
| } |
| rreq->len = rreq->transferred; |
| rreq->transferred -= iocb->ki_pos - start; |
| } |
| |
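	/* The transfer may include padding out to the crypto block size, so
	 * don't copy out more than the destination can hold.
	 */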
| if (rreq->transferred > iov_iter_count(dest_iter)) |
| rreq->transferred = iov_iter_count(dest_iter); |
| |
| _debug("xfer %zx/%llx @%llx", rreq->transferred, rreq->len, iocb->ki_pos); |
| |
| if (!rreq->transferred || !iov_iter_count(dest_iter)) { |
| trace_netfs_failure(rreq, NULL, -EIO, netfs_fail_dio_read_zero); |
| return -EIO; |
| } |
| |
| return rolling_buffer_copy_to_iter(&rreq->bounce, dest_iter, |
| iocb->ki_pos, rreq->transferred); |
| } |
| |
| /* |
| * Prepare the buffer iterator for the network filesystem. We may limit the |
| * size of the request to the maximum read size, the maximum number of segments |
| * or to trim it to a block size. |
| */ |
| void netfs_prepare_dio_read_iterator(struct netfs_io_subrequest *subreq, size_t bsize) |
| { |
| struct netfs_io_request *rreq = subreq->rreq; |
| struct iov_iter *iter = &rreq->buffer.iter; |
| size_t rsize; |
| |
| if (test_bit(NETFS_RREQ_USE_BOUNCE_BUFFER, &rreq->flags)) |
| iter = &rreq->bounce.iter; |
| |
| rsize = umin(subreq->len, rreq->io_streams[0].sreq_max_len); |
| subreq->len = rsize; |
| |
| if (unlikely(rreq->io_streams[0].sreq_max_segs)) { |
| size_t limit = netfs_limit_iter(iter, 0, rsize, |
| rreq->io_streams[0].sreq_max_segs); |
| |
| if (limit < rsize) { |
| subreq->len = limit; |
| trace_netfs_sreq(subreq, netfs_sreq_trace_limited); |
| } |
| } |
| |
| /* Don't split a crypto block across multiple transfers unless the |
| * transport won't support a transfer that big. |
| */ |
| if (subreq->len > bsize) |
| subreq->len = round_down(subreq->len, bsize); |
| |
| trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); |
| |
| subreq->io_iter = *iter; |
| iov_iter_truncate(&subreq->io_iter, subreq->len); |
| iov_iter_advance(iter, subreq->len); |
| } |
| |
| /* |
| * Perform a read to a buffer from the server, slicing up the region to be read |
| * according to the network rsize. |
| */ |
| static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) |
| { |
| struct netfs_io_stream *stream = &rreq->io_streams[0]; |
| unsigned long long start = rreq->start; |
| ssize_t size = rreq->len; |
| size_t bsize = rreq->crypto_bsize; |
| int ret = 0; |
| |
| /* Pad out an encrypted transfer to the block size. */ |
| start = round_down(start, bsize); |
| size += rreq->start - start; |
| size = round_up(size, bsize); |
| |
| do { |
| struct netfs_io_subrequest *subreq; |
| ssize_t slice; |
| |
| subreq = netfs_alloc_subrequest(rreq, NETFS_DOWNLOAD_FROM_SERVER); |
| if (!subreq) { |
| ret = -ENOMEM; |
| break; |
| } |
| |
| subreq->start = start; |
| subreq->len = size; |
| |
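		/* Mark the subrequest in progress before making it visible to
		 * the collector.
		 */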
| __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); |
| |
| spin_lock(&rreq->lock); |
| list_add_tail(&subreq->rreq_link, &stream->subrequests); |
| if (list_is_first(&subreq->rreq_link, &stream->subrequests)) { |
| stream->front = subreq; |
| if (!stream->active) { |
| stream->collected_to = stream->front->start; |
| /* Store list pointers before active flag */ |
| smp_store_release(&stream->active, true); |
| } |
| } |
| trace_netfs_sreq(subreq, netfs_sreq_trace_added); |
| spin_unlock(&rreq->lock); |
| |
| netfs_stat(&netfs_n_rh_download); |
| if (rreq->netfs_ops->prepare_read) { |
| ret = rreq->netfs_ops->prepare_read(subreq); |
| if (ret < 0) { |
| netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel); |
| break; |
| } |
| } |
| |
| /* Make sure we have sufficient bounce bufferage. */ |
| if (test_bit(NETFS_RREQ_USE_BOUNCE_BUFFER, &rreq->flags)) { |
| ret = netfs_alloc_bounce(rreq, subreq->start + subreq->len, GFP_KERNEL); |
| if (ret < 0) { |
| netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel); |
| break; |
| } |
| } |
| |
| netfs_prepare_dio_read_iterator(subreq, bsize); |
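		/* The iterator preparation may have shrunk the subrequest;
		 * only account for what will actually be issued.
		 */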
| slice = subreq->len; |
| size -= slice; |
| start += slice; |
| rreq->submitted += slice; |
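		/* If that was the final slice, flag the request as fully
		 * queued before issuing it.
		 */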
| if (size <= 0) { |
| smp_wmb(); /* Write lists before ALL_QUEUED. */ |
| set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); |
| } |
| |
| rreq->netfs_ops->issue_read(subreq); |
| |
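		/* Wait if the collector has paused the request, and give up if
		 * it failed or would have to block on nonblocking I/O.
		 */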
| if (test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) |
| netfs_wait_for_paused_read(rreq); |
| if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) |
| break; |
| if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) && |
| test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags)) |
| break; |
| cond_resched(); |
| } while (size > 0); |
| |
| if (unlikely(size > 0)) { |
| smp_wmb(); /* Write lists before ALL_QUEUED. */ |
| set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); |
| netfs_wake_collector(rreq); |
| } |
| |
| return ret; |
| } |
| |
| /* |
| * Perform a read to an application buffer, bypassing the pagecache and the |
| * local disk cache. |
| */ |
| static ssize_t netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync) |
| { |
| ssize_t ret; |
| |
| _enter("R=%x %llx-%llx", |
| rreq->debug_id, rreq->start, rreq->start + rreq->len - 1); |
| |
| if (rreq->len == 0) { |
| pr_err("Zero-sized read [R=%x]\n", rreq->debug_id); |
| return -EIO; |
| } |
| |
| // TODO: Use bounce buffer if requested |
| |
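	/* Let the filesystem know a DIO read is in progress (e.g. to hold off
	 * truncation).
	 */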
| inode_dio_begin(rreq->inode); |
| |
| ret = netfs_dispatch_unbuffered_reads(rreq); |
| |
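	/* If nothing was submitted, there's nothing to collect or wait for. */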
| if (!rreq->submitted) { |
| netfs_put_request(rreq, netfs_rreq_trace_put_no_submit); |
| inode_dio_end(rreq->inode); |
| ret = 0; |
| goto out; |
| } |
| |
| if (sync) |
| ret = netfs_wait_for_read(rreq); |
| else |
| ret = -EIOCBQUEUED; |
| out: |
| _leave(" = %zd", ret); |
| return ret; |
| } |
| |
| /** |
| * netfs_unbuffered_read_iter_locked - Perform an unbuffered or direct I/O read |
| * @iocb: The I/O control descriptor describing the read |
| * @iter: The output buffer (also specifies read length) |
| * |
| * Perform an unbuffered I/O or direct I/O from the file in @iocb to the |
| * output buffer. No use is made of the pagecache. |
| * |
 * The caller must hold any appropriate locks.
 *
 * Return: The number of bytes read, -EIOCBQUEUED if the read has been queued
 * for asynchronous completion, or a negative error code.
 */
| ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *iter) |
| { |
| struct netfs_io_request *rreq; |
| ssize_t ret; |
| size_t orig_count = iov_iter_count(iter); |
| bool sync = is_sync_kiocb(iocb); |
| |
| _enter(""); |
| |
| if (!orig_count) |
| return 0; /* Don't update atime */ |
| |
| ret = kiocb_write_and_wait(iocb, orig_count); |
| if (ret < 0) |
| return ret; |
| file_accessed(iocb->ki_filp); |
| |
| rreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp, |
| iocb->ki_pos, orig_count, NULL, |
| iocb->ki_flags & IOCB_DIRECT ? |
| NETFS_DIO_READ : NETFS_UNBUFFERED_READ); |
| if (IS_ERR(rreq)) |
| return PTR_ERR(rreq); |
| |
| netfs_stat(&netfs_n_rh_dio_read); |
| trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_dio_read); |
| |
| /* If this is an async op, we have to keep track of the destination |
| * buffer for ourselves as the caller's iterator will be trashed when |
| * we return. |
| * |
	 * In such a case, extract an iterator to represent as much of the
| * output buffer as we can manage. Note that the extraction might not |
| * be able to allocate a sufficiently large bvec array and may shorten |
| * the request. |
| */ |
| if (user_backed_iter(iter)) { |
| ret = netfs_extract_user_iter(iter, rreq->len, &rreq->buffer.iter, 0); |
| if (ret < 0) |
| goto out; |
| rreq->direct_bv = (struct bio_vec *)rreq->buffer.iter.bvec; |
| rreq->direct_bv_count = ret; |
| rreq->direct_bv_unpin = iov_iter_extract_will_pin(iter); |
| rreq->len = iov_iter_count(&rreq->buffer.iter); |
| } else { |
| rreq->buffer.iter = *iter; |
| rreq->len = orig_count; |
| rreq->direct_bv_unpin = false; |
| iov_iter_advance(iter, orig_count); |
| } |
| |
| /* If we're going to do decryption or decompression, we're going to |
| * need a bounce buffer. If the output buffer is correctly aligned and |
| * correctly sized for the crypto algorithm, we get a free copy between |
| * buffers from the crypto; if misaligned, we decrypt in place in the |
| * bounce buffer and then copy. |
| */ |
| if (test_bit(NETFS_RREQ_CONTENT_ENCRYPTION, &rreq->flags)) { |
| if (!netfs_is_crypto_aligned(rreq, iter)) |
| __set_bit(NETFS_RREQ_CRYPT_IN_PLACE, &rreq->flags); |
| __set_bit(NETFS_RREQ_USE_BOUNCE_BUFFER, &rreq->flags); |
| } |
| |
| /* Set up the bounce buffer if we need it. Allow for padding the |
	 * request out to the crypto block size and allocate at least one folio
| * into it. |
| */ |
| if (test_bit(NETFS_RREQ_USE_BOUNCE_BUFFER, &rreq->flags)) { |
| size_t bsize = rreq->crypto_bsize; |
| |
| rreq->bounce_alloc_to = round_down(rreq->start, bsize); |
| |
| ret = rolling_buffer_init(&rreq->bounce, rreq->debug_id, ITER_DEST); |
| if (ret < 0) |
| goto out; |
| |
| ret = netfs_alloc_bounce(rreq, rreq->bounce_alloc_to + bsize, GFP_KERNEL); |
| if (ret < 0) |
| goto out; |
| } |
| |
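	/* For an async read, collection must be offloaded to a workqueue and
	 * the iocb completed from there.
	 */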
| if (!sync) { |
| rreq->iocb = iocb; |
| __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags); |
| } |
| |
| ret = netfs_unbuffered_read(rreq, sync); |
| if (ret < 0) |
| goto out; /* May be -EIOCBQUEUED */ |
| if (sync) { |
| ret = netfs_dio_copy_bounce_to_dest(rreq, iocb); |
| if (ret == 0) { |
| iocb->ki_pos += rreq->transferred; |
| ret = rreq->transferred; |
| } |
| } |
| |
| out: |
| netfs_put_request(rreq, netfs_rreq_trace_put_return); |
| if (ret > 0) |
| orig_count -= ret; |
| return ret; |
| } |
| EXPORT_SYMBOL(netfs_unbuffered_read_iter_locked); |
| |
| /** |
| * netfs_unbuffered_read_iter - Perform an unbuffered or direct I/O read |
| * @iocb: The I/O control descriptor describing the read |
| * @iter: The output buffer (also specifies read length) |
| * |
| * Perform an unbuffered I/O or direct I/O from the file in @iocb to the |
 * output buffer. No use is made of the pagecache.
 *
 * Return: The number of bytes read, -EIOCBQUEUED if the read has been queued
 * for asynchronous completion, or a negative error code.
 */
| ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter) |
| { |
| struct inode *inode = file_inode(iocb->ki_filp); |
| ssize_t ret; |
| |
| if (!iter->count) |
| return 0; /* Don't update atime */ |
| |
| ret = netfs_start_io_direct(inode); |
| if (ret == 0) { |
| ret = netfs_unbuffered_read_iter_locked(iocb, iter); |
| netfs_end_io_direct(inode); |
| } |
| return ret; |
| } |
| EXPORT_SYMBOL(netfs_unbuffered_read_iter); |
| |
| /** |
| * netfs_unbuffered_read_from_inode - Perform an unbuffered sync I/O read |
| * @inode: The inode being accessed |
| * @pos: The file position to read from |
| * @iter: The output buffer (also specifies read length) |
| * @nohole: True to return short/ENODATA if hole encountered |
| * |
| * Perform a synchronous unbuffered I/O from the inode to the output buffer. |
| * No use is made of the pagecache. The output buffer must be suitably aligned |
| * if content encryption is to be used. If @nohole is true then the read will |
| * stop short if a hole is encountered and return -ENODATA if the read begins |
| * with a hole. |
| * |
 * The caller must hold any appropriate locks.
 *
 * Return: The number of bytes read or a negative error code.
 */
| ssize_t netfs_unbuffered_read_from_inode(struct inode *inode, loff_t pos, |
| struct iov_iter *iter, bool nohole) |
| { |
| struct netfs_io_request *rreq; |
| ssize_t ret; |
| size_t orig_count = iov_iter_count(iter); |
| |
| _enter(""); |
| |
| if (WARN_ON(user_backed_iter(iter))) |
| return -EIO; |
| |
| if (!orig_count) |
| return 0; /* Don't update atime */ |
| |
	ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + orig_count - 1);
| if (ret < 0) |
| return ret; |
| inode_update_time(inode, S_ATIME); |
| |
| rreq = netfs_alloc_request(inode->i_mapping, NULL, pos, orig_count, |
| NULL, NETFS_UNBUFFERED_READ); |
| if (IS_ERR(rreq)) |
| return PTR_ERR(rreq); |
| |
| ret = -EIO; |
| if (test_bit(NETFS_RREQ_CONTENT_ENCRYPTION, &rreq->flags) && |
| WARN_ON(!netfs_is_crypto_aligned(rreq, iter))) |
| goto out; |
| |
| netfs_stat(&netfs_n_rh_dio_read); |
| trace_netfs_read(rreq, rreq->start, rreq->len, |
| netfs_read_trace_unbuffered_read_from_inode); |
| |
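	/* The iterator is kernel-backed (checked above), so it can be used
	 * directly without extracting the pages.
	 */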
| rreq->buffer.iter = *iter; |
| rreq->len = orig_count; |
| rreq->direct_bv_unpin = false; |
| iov_iter_advance(iter, orig_count); |
| |
| if (nohole) |
| __set_bit(NETFS_RREQ_NO_READ_HOLE, &rreq->flags); |
| |
| /* We're going to do the crypto in place in the destination buffer. */ |
| if (test_bit(NETFS_RREQ_CONTENT_ENCRYPTION, &rreq->flags)) |
| __set_bit(NETFS_RREQ_CRYPT_IN_PLACE, &rreq->flags); |
| |
| ret = netfs_dispatch_unbuffered_reads(rreq); |
| |
| if (!rreq->submitted) { |
| netfs_put_request(rreq, netfs_rreq_trace_put_no_submit); |
| goto out; |
| } |
| |
| ret = netfs_wait_for_read(rreq); |
| out: |
| netfs_put_request(rreq, netfs_rreq_trace_put_return); |
| return ret; |
| } |
| EXPORT_SYMBOL(netfs_unbuffered_read_from_inode); |