// SPDX-License-Identifier: GPL-2.0-or-later
/* Unbuffered and direct write support.
 *
 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/uio.h>
#include "internal.h"

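/*
 * Clean up after a DIO/unbuffered write: if the write succeeded and extended
 * the file, update the inode size, preferring the filesystem's own
 * ->update_i_size() hook if one is provided.
 */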
static void netfs_cleanup_dio_write(struct netfs_io_request *wreq)
{
	struct inode *inode = wreq->inode;
	unsigned long long end = wreq->start + wreq->transferred;

	if (!wreq->error &&
	    i_size_read(inode) < end) {
		if (wreq->netfs_ops->update_i_size)
			wreq->netfs_ops->update_i_size(inode, end);
		else
			i_size_write(inode, end);
	}
}

/*
 * Write some data from a bounce buffer folio to the server.
 */
static int netfs_write_bounce(struct netfs_io_request *wreq, struct folio *folio,
			      size_t foff, size_t flen)
{
	struct netfs_io_stream *stream = &wreq->io_streams[0];
	size_t fsize = folio_size(folio);
	loff_t fpos = folio_pos(folio);
	bool debug = false;

	_enter("%zx,%zx", foff, flen);

	//stream->sreq_max_len = fsize;
	stream->submit_off = foff;
	stream->submit_len = flen;

	/* Attach the folio to one or more subrequests. For a big folio, we
	 * could end up with thousands of subrequests if the wsize is small -
	 * but we might need to wait during the creation of subrequests for
	 * network resources (eg. SMB credits).
	 */
	do {
		ssize_t part;

		part = netfs_advance_write(wreq, stream, fpos + stream->submit_off,
					   stream->submit_len, false);
		atomic64_set(&wreq->issued_to, fpos + stream->submit_off);
		stream->submit_off += part;
		stream->sreq_max_len -= part;
		if (part > stream->submit_len)
			stream->submit_len = 0;
		else
			stream->submit_len -= part;
		if (part > 0)
			debug = true;
		cond_resched();
	} while (stream->submit_len > 0);

	atomic64_set(&wreq->issued_to, fpos + fsize);

	if (!debug)
		kdebug("R=%x: No submit", wreq->debug_id);

	_leave(" = 0");
	return 0;
}

/*
 * Insert pages into the bounce buffer and fill them, dispatching writes to
 * cover them as we go. If there are gaps at either end of the bounce buffer
 * that need to be filled, we download the content for those from the server
 * and perform an RMW cycle.
 */
static ssize_t netfs_write_through_bounce_buffer(struct netfs_io_request *wreq,
						 struct kiocb *iocb, struct iov_iter *iter,
						 struct netfs_group *netfs_group)
{
	struct netfs_io_stream *upload = &wreq->io_streams[0];
	struct netfs_inode *ictx = netfs_inode(wreq->inode);
	struct folio_queue *fq;
	unsigned long long real_size = ictx->remote_i_size;
	unsigned long long start = wreq->start;
	unsigned long long end = start + iov_iter_count(iter);
	size_t bsize = wreq->crypto_bsize;
	size_t bmask = bsize - 1;
	size_t len = iov_iter_count(iter);
	size_t gap_before = start & bmask;
	size_t gap_after = (bsize - (gap_before + len)) & bmask;
	size_t written = 0;
	int ret, slot;

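	/* Worked example (illustrative): with bsize = 0x1000, a write of 0x200
	 * bytes at file position 0x1100 gives gap_before = 0x100 and
	 * gap_after = 0xd00, i.e. the single crypto block 0x1000-0x1fff must
	 * be completed at both ends before it can be encrypted and uploaded.
	 */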
	_enter("%llx-%llx", start, end);

	/* Assume crypto involves units of >1 byte. */
	if (WARN_ON_ONCE(bsize <= 1))
		return -EIO;

	/* The real size must be rounded out to the crypto block size plus
	 * any trailer we might want to attach.
	 */
	if (real_size && wreq->crypto_bsize) {
		if (real_size < wreq->crypto_trailer)
			return -EIO;
		if ((real_size - wreq->crypto_trailer) & bmask)
			return -EIO;
		real_size -= wreq->crypto_trailer;
	}

	if (wreq->origin == NETFS_DIO_WRITE)
		inode_dio_begin(wreq->inode);

	/* Prime the bounce buffer since we're going to need at least
	 * one folio therein.
	 */
	ret = rolling_buffer_init(&wreq->bounce, wreq->debug_id, ITER_SOURCE);
	if (ret < 0)
		goto error;
	wreq->bounce_alloc_to = round_down(start, umax(bsize, PAGE_SIZE));
	ret = netfs_alloc_bounce(wreq, round_down(start, bsize) + bsize, GFP_KERNEL);
	if (ret < 0)
		goto error;
	fq = wreq->bounce.tail;
	slot = 0;

	/* Populate the buffer for any RMW we need to do at either end of the
	 * bounce buffer and perform the read(s).
	 */
	if (gap_before || gap_after) {
		struct folio *folio1 = folioq_folio(fq, slot);
		struct folio *folio2 = NULL;
		unsigned long long gstart = start - gap_before;
		unsigned long long gend = start + len + gap_after;
		bool read_before = false, read_after = false;
		bool overlap = round_down(gend - bsize, PAGE_SIZE) <=
			round_down(gstart, PAGE_SIZE);

		wreq->rmw_gap_before = gap_before;
		wreq->rmw_gap_after = gap_after;
		wreq->start = gstart;
		wreq->len = gend - gstart;

		__set_bit(NETFS_RREQ_RMW, &wreq->flags);
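		/* Work out which gap(s) overlap data that already exists on
		 * the server and so must be read back; gaps at or beyond the
		 * remote EOF are simply zero-filled below.
		 */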
		if (gstart >= real_size) {
			/* At or after EOF, nothing to read. */
		} else {
			if (gap_before)
				read_before = true; /* Part of first gap before EOF. */

			if (!gap_after)
				;
			else if (gend - bsize <= gstart)
				; /* All in one crypto block. */
			else if (gend - bsize >= real_size)
				; /* Last block at/after EOF. */
			else
				read_after = true; /* Part of last crypto block before EOF. */
		}

		_debug("gaps=%zx,%zx ov=%u read=%u,%u",
		       gap_before, gap_after, overlap, read_before, read_after);

		if (gap_after && !overlap) {
			unsigned int shift = umax(ilog2(bsize), PAGE_SHIFT);
			unsigned int order = shift - PAGE_SHIFT;

			ret = -ENOMEM;
			folio2 = folio_alloc(GFP_KERNEL, order);
			if (!folio2)
				goto error;

			folio2->index = (gend - folio_size(folio2)) / PAGE_SIZE;
			wreq->rmw_tail = folio2;
			__set_bit(NETFS_RREQ_PUT_RMW_TAIL, &wreq->flags);
		} else if (gap_after && overlap) {
			wreq->rmw_tail = folio1;
			folio2 = folio1;
		}

		if (gap_before && !read_before)
			folio_zero_segment(folio1, 0, offset_in_folio(folio1, start));

		if (gap_after && !read_after)
			folio_zero_segment(folio2, offset_in_folio(folio2, end),
					   folio_size(folio2));

		if (read_before || read_after) {
			ret = netfs_rmw_read(wreq, iocb->ki_filp,
					     gstart, gap_before,
					     gend - gap_after, gap_after);
			if (ret < 0)
				goto error;
		}

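		/* Widen the local start/len to cover the whole crypto-block-
		 * aligned span, including the gap padding, matching what was
		 * set on the request above.
		 */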
		start = wreq->start;
		len = wreq->len;
	}

	/* Find out subreq limits. */
	netfs_prepare_write(wreq, upload, start);

	/* Copy the data in progressively, dispatching writes as we go. */
	while ((len = iov_iter_count(iter))) {
		struct iov_iter tmp = *iter;
		struct folio *folio;
		ssize_t copied;
		size_t foff, fsize, part;

		/* Get the bounce folio we're going to copy/encrypt the data
		 * to. Note that we use multipage folios if bsize > PAGE_SIZE.
		 */
		ret = netfs_alloc_bounce(wreq, start + umin(len, bsize), GFP_KERNEL);
		if (ret < 0)
			break;

		if (slot == folioq_nr_slots(fq)) {
			fq = fq->next;
			slot = 0;
		}

		folio = folioq_folio(fq, slot);
		fsize = folioq_folio_size(fq, slot);
		foff = offset_in_folio(folio, start + gap_before);
		WARN_ON_ONCE(folio_pos(folio) > start);

		part = umin(fsize - foff, len);

		/* If this next stretch of data aligns with the crypto
		 * algorithm block size, we can get a free copy from the
		 * encryption algorithm.
		 */
		iov_iter_truncate(&tmp, part);
		if (!gap_before &&
		    !(gap_after && part == len) &&
		    netfs_is_crypto_aligned(wreq, &tmp)) {
			if (!netfs_encrypt_to_folio(wreq, folio, iter, start, part, GFP_KERNEL)) {
				ret = wreq->error;
				break;
			}
			copied = part;
		} else {
			copied = copy_folio_from_iter(folio, foff, part, iter);
			if (copied < part)
				break;
			copied += gap_before;
			if (part == len)
				copied += gap_after;
			if (!netfs_encrypt_folio(wreq, folio, folio, start, copied, GFP_KERNEL)) {
				ret = wreq->error;
				break;
			}
		}

		ret = netfs_write_bounce(wreq, folio, offset_in_folio(folio, start), copied);
		if (ret < 0)
			break;
		start += copied;
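		/* Only account for the caller's own bytes; the RMW gap padding
		 * is copied and uploaded but not reported as written.
		 */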
		if (part < len) {
			written += copied;
			written -= gap_before;
		} else {
			written += part;
		}
		if (foff + part >= fsize)
			slot++;
		gap_before = 0;
		cond_resched();
	}

error:
	netfs_issue_write(wreq, upload);
	smp_wmb(); /* Write lists before ALL_QUEUED. */
	set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);
	if (list_empty(&upload->subrequests))
		netfs_wake_collector(wreq);
	_leave(" = %d [%zx]", ret, written);
	return written ?: ret;
}

/*
 * Perform an unbuffered write where we may have to do an RMW operation on an
 * encrypted file. This can also be used for direct I/O writes.
 */
ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter,
					   struct netfs_group *netfs_group)
{
	struct netfs_io_request *wreq;
	struct netfs_inode *ctx = netfs_inode(file_inode(iocb->ki_filp));
	unsigned long long start = iocb->ki_pos;
	unsigned long long end = start + iov_iter_count(iter);
	ssize_t ret, n;
	size_t len = iov_iter_count(iter);
	bool async = !is_sync_kiocb(iocb);

	_enter("");

	_debug("uw %llx-%llx", start, end);

	wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp, start, NULL,
				      iocb->ki_flags & IOCB_DIRECT ?
				      NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE);
	if (IS_ERR(wreq))
		return PTR_ERR(wreq);

	wreq->io_streams[0].avail = true;
	wreq->cleanup = netfs_cleanup_dio_write;
	wreq->len = iov_iter_count(iter);
	trace_netfs_write(wreq, (iocb->ki_flags & IOCB_DIRECT ?
				 netfs_write_trace_dio_write :
				 netfs_write_trace_unbuffered_write));

	/* We're going to need a bounce buffer if what we transmit is going to
	 * be different in some way to the source buffer, e.g. because it gets
	 * encrypted/compressed or because it needs expanding to a block size.
	 */
	if (test_bit(NETFS_ICTX_ENCRYPTED, &ctx->flags)) {
		ret = netfs_write_through_bounce_buffer(wreq, iocb, iter, netfs_group);
		goto done;
	}

	/* If this is an async op and we're not using a bounce buffer, we have
	 * to save the source buffer as the iterator is only good until we
	 * return. In such a case, extract an iterator to represent as much of
	 * the output buffer as we can manage. Note that the extraction might
	 * not be able to allocate a sufficiently large bvec array and may
	 * shorten the request.
	 */
	if (user_backed_iter(iter)) {
		n = netfs_extract_user_iter(iter, len, &wreq->buffer.iter, 0);
		if (n < 0) {
			ret = n;
			goto out;
		}
		wreq->direct_bv = (struct bio_vec *)wreq->buffer.iter.bvec;
		wreq->direct_bv_count = n;
		wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);
	} else {
		/* If this is a kernel-generated async DIO request, assume that
		 * any resources the iterator points to (eg. a bio_vec array)
		 * will persist till the end of the op.
		 */
		wreq->buffer.iter = *iter;
	}

	__set_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags);
	if (async)
		__set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &wreq->flags);

	/* Dispatch the write. */
	__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
	if (async)
		wreq->iocb = iocb;
	wreq->len = iov_iter_count(&wreq->buffer.iter);
	ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), wreq->len);
done:
	if (ret < 0) {
		_debug("begin = %zd", ret);
		goto out;
	}

	if (!async) {
		trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip);
		ret = netfs_wait_for_write(wreq);
		if (ret > 0)
			iocb->ki_pos += ret;
	} else {
		ret = -EIOCBQUEUED;
	}

out:
	netfs_put_request(wreq, netfs_rreq_trace_put_return);
	return ret;
}
EXPORT_SYMBOL(netfs_unbuffered_write_iter_locked);

/**
 * netfs_unbuffered_write_iter - Unbuffered write to a file
 * @iocb: IO state structure
 * @from: iov_iter with data to write
 *
 * Do an unbuffered write to a file, writing the data directly to the server
 * and not lodging the data in the pagecache.
 *
 * Return:
 * * Negative error code if no data has been written at all or
 *   vfs_fsync_range() failed for a synchronous write
 * * Number of bytes written, even for truncated writes
 */
ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct netfs_inode *ictx = netfs_inode(inode);
	ssize_t ret;
	loff_t pos = iocb->ki_pos;
	unsigned long long end = pos + iov_iter_count(from) - 1;

	_enter("%llx,%zx,%llx", pos, iov_iter_count(from), i_size_read(inode));

	if (!iov_iter_count(from))
		return 0;

	trace_netfs_write_iter(iocb, from);
	netfs_stat(&netfs_n_wh_dio_write);

	ret = netfs_start_io_direct(inode);
	if (ret < 0)
		return ret;
	ret = generic_write_checks(iocb, from);
	if (ret <= 0)
		goto out;
	ret = file_remove_privs(file);
	if (ret < 0)
		goto out;
	ret = file_update_time(file);
	if (ret < 0)
		goto out;
	if (iocb->ki_flags & IOCB_NOWAIT) {
		/* We could block if there are any pages in the range. */
		ret = -EAGAIN;
		if (filemap_range_has_page(mapping, pos, end))
			if (filemap_invalidate_inode(inode, true, pos, end))
				goto out;
	} else {
		ret = filemap_write_and_wait_range(mapping, pos, end);
		if (ret < 0)
			goto out;
	}

	/*
	 * After a write we want buffered reads to be sure to go to disk to get
	 * the new data. We invalidate clean cached pages from the region we're
	 * about to write. We do this *before* the write so that we can return
	 * without clobbering -EIOCBQUEUED from ->direct_IO().
	 */
	ret = filemap_invalidate_inode(inode, true, pos, end);
	if (ret < 0)
		goto out;
	end = iocb->ki_pos + iov_iter_count(from);
	if (end > ictx->zero_point)
		ictx->zero_point = end;

	fscache_invalidate(netfs_i_cookie(ictx), NULL, i_size_read(inode),
			   FSCACHE_INVAL_DIO_WRITE);
	ret = netfs_unbuffered_write_iter_locked(iocb, from, NULL);
out:
	netfs_end_io_direct(inode);
	return ret;
}
EXPORT_SYMBOL(netfs_unbuffered_write_iter);
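
/*
 * Usage sketch (illustrative only, not part of this file): a network
 * filesystem would typically call netfs_unbuffered_write_iter() from its
 * ->write_iter() handler when IOCB_DIRECT is set, e.g.:
 *
 *	static ssize_t myfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 *	{
 *		if (iocb->ki_flags & IOCB_DIRECT)
 *			return netfs_unbuffered_write_iter(iocb, from);
 *		return netfs_file_write_iter(iocb, from);
 *	}
 *
 * where myfs_file_write_iter() is a hypothetical handler and
 * netfs_file_write_iter() is the buffered-write entry point.
 */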