// SPDX-License-Identifier: GPL-2.0-or-later
/* Unbuffered and direct write support.
*
* Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
#include <linux/export.h>
#include <linux/uio.h>
#include "internal.h"
static void netfs_cleanup_dio_write(struct netfs_io_request *wreq)
{
struct inode *inode = wreq->inode;
unsigned long long end = wreq->start + wreq->transferred;
if (!wreq->error &&
i_size_read(inode) < end) {
if (wreq->netfs_ops->update_i_size)
wreq->netfs_ops->update_i_size(inode, end);
else
i_size_write(inode, end);
}
}
/*
* Write some data from a bounce buffer folio to the server.
*/
static int netfs_write_bounce(struct netfs_io_request *wreq, struct folio *folio,
size_t foff, size_t flen)
{
struct netfs_io_stream *stream = &wreq->io_streams[0];
size_t fsize = folio_size(folio);
loff_t fpos = folio_pos(folio);
bool debug = false;
_enter("%zx,%zx", foff, flen);
//stream->sreq_max_len = fsize;
stream->submit_off = foff;
stream->submit_len = flen;
/* Attach the folio to one or more subrequests. For a big folio, we
* could end up with thousands of subrequests if the wsize is small -
* but we might need to wait during the creation of subrequests for
* network resources (eg. SMB credits).
*/
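/* Illustration (assumed numbers): a 2MiB folio sent with a 64KiB wsize
* would be split into 32 consecutive subrequests by this loop.
*/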
do {
ssize_t part;
part = netfs_advance_write(wreq, stream, fpos + stream->submit_off,
stream->submit_len, false);
atomic64_set(&wreq->issued_to, fpos + stream->submit_off);
stream->submit_off += part;
stream->sreq_max_len -= part;
if (part > stream->submit_len)
stream->submit_len = 0;
else
stream->submit_len -= part;
if (part > 0)
debug = true;
cond_resched();
} while (stream->submit_len > 0);
atomic64_set(&wreq->issued_to, fpos + fsize);
if (!debug)
kdebug("R=%x: No submit", wreq->debug_id);
_leave(" = 0");
return 0;
}
/*
* Insert pages into the bounce buffer and fill them, dispatching writes to
* cover them as we go. If there are gaps at either end of the bounce buffer
* that need to be filled, we download the content for those from the server
* and perform an RMW cycle.
*/
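/*
* Worked example (illustrative, assuming a crypto block size of 4096): a
* 100-byte write at file position 4000 has gap_before = 4000 & 4095 = 4000
* and gap_after = (4096 - (4000 + 100)) & 4095 = 4092, so the request is
* expanded to cover bytes 0-8191 (two crypto blocks) and the gaps are either
* read back from the server or zero-filled before encryption.
*/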
static ssize_t netfs_write_through_bounce_buffer(struct netfs_io_request *wreq,
struct kiocb *iocb, struct iov_iter *iter,
struct netfs_group *netfs_group)
{
struct netfs_io_stream *upload = &wreq->io_streams[0];
struct netfs_inode *ictx = netfs_inode(wreq->inode);
struct folio_queue *fq;
unsigned long long real_size = ictx->remote_i_size;
unsigned long long start = wreq->start;
unsigned long long end = start + iov_iter_count(iter);
size_t bsize = wreq->crypto_bsize;
size_t bmask = bsize - 1;
size_t len = iov_iter_count(iter);
size_t gap_before = start & bmask;
size_t gap_after = (bsize - (gap_before + len)) & bmask;
size_t written = 0;
int ret, slot;
_enter("%llx-%llx", start, end);
/* Assume crypto involves units of >1 byte. */
if (WARN_ON_ONCE(bsize <= 1))
return -EIO;
/* The real size must be rounded out to the crypto block size plus
* any trailer we might want to attach.
*/
if (real_size && wreq->crypto_bsize) {
if (real_size < wreq->crypto_trailer)
return -EIO;
if ((real_size - wreq->crypto_trailer) & bmask)
return -EIO;
real_size -= wreq->crypto_trailer;
}
if (wreq->origin == NETFS_DIO_WRITE)
inode_dio_begin(wreq->inode);
/* Prime the bounce buffer since we're going to need at least
* one folio therein.
*/
ret = rolling_buffer_init(&wreq->bounce, wreq->debug_id, ITER_SOURCE);
if (ret < 0)
goto error;
wreq->bounce_alloc_to = round_down(start, umax(bsize, PAGE_SIZE));
ret = netfs_alloc_bounce(wreq, round_down(start, bsize) + bsize, GFP_KERNEL);
if (ret < 0)
goto error;
fq = wreq->bounce.tail;
slot = 0;
/* Populate the buffer for any RMW we need to do at either end of the
* bounce buffer and perform the read(s).
*/
if (gap_before || gap_after) {
struct folio *folio1 = folioq_folio(fq, slot);
struct folio *folio2 = NULL;
unsigned long long gstart = start - gap_before;
unsigned long long gend = start + len + gap_after;
bool read_before = false, read_after = false;
bool overlap = round_down(gend - bsize, PAGE_SIZE) <=
round_down(gstart, PAGE_SIZE);
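/* overlap means the first and last crypto blocks of the expanded range
* start in the same page, so the trailing gap can be handled in the
* leading bounce folio rather than in a separately allocated tail folio.
*/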
wreq->rmw_gap_before = gap_before;
wreq->rmw_gap_after = gap_after;
wreq->start = gstart;
wreq->len = gend - gstart;
__set_bit(NETFS_RREQ_RMW, &ictx->flags);
if (gstart >= end) {
/* At or after EOF, nothing to read. */
} else {
if (gap_before)
read_before = true; /* Part of first gap before EOF. */
if (!gap_after)
;
else if (gend - bsize <= gstart)
; /* All in one crypto block. */
else if (gend - bsize >= real_size)
; /* Last block at/after EOF. */
else
read_after = true; /* Part of last crypto block before EOF. */
}
_debug("gaps=%zx,%zx ov=%u read=%u,%u",
gap_before, gap_after, overlap, read_before, read_after);
if (gap_after && !overlap) {
unsigned int shift = umax(ilog2(bsize), PAGE_SHIFT);
unsigned int order = shift - PAGE_SHIFT;
ret = -ENOMEM;
folio2 = folio_alloc(GFP_KERNEL, order);
if (!folio2)
goto error;
folio2->index = (gend - folio_size(folio2)) / PAGE_SIZE;
wreq->rmw_tail = folio2;
__set_bit(NETFS_RREQ_PUT_RMW_TAIL, &wreq->flags);
} else if (gap_after && overlap) {
wreq->rmw_tail = folio1;
folio2 = folio1;
}
if (gap_before && !read_before)
folio_zero_segment(folio1, 0, offset_in_folio(folio1, start));
if (gap_after && !read_after)
folio_zero_segment(folio2, offset_in_folio(folio2, end),
folio_size(folio2));
if (read_before || read_after) {
ret = netfs_rmw_read(wreq, iocb->ki_filp,
gstart, gap_before,
gend - gap_after, gap_after);
if (ret < 0)
goto error;
}
start = wreq->start;
len = wreq->len;
}
/* Find out subreq limits. */
netfs_prepare_write(wreq, upload, start);
/* Copy the data in progressively, dispatching writes as we go. */
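/* Each pass of the loop below fills at most one bounce folio, encrypts it
* and then hands it to netfs_write_bounce() for dispatch.
*/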
while ((len = iov_iter_count(iter))) {
struct iov_iter tmp = *iter;
struct folio *folio;
ssize_t copied;
size_t foff, fsize, part;
/* Get the bounce folio we're going to copy/encrypt the data
* to. Note that we use multipage folios if bsize > PAGE_SIZE.
*/
ret = netfs_alloc_bounce(wreq, start + umin(len, bsize), GFP_KERNEL);
if (ret < 0)
break;
if (slot == folioq_nr_slots(fq)) {
fq = fq->next;
slot = 0;
}
folio = folioq_folio(fq, slot);
fsize = folioq_folio_size(fq, slot);
foff = offset_in_folio(folio, start + gap_before);
WARN_ON_ONCE(folio_pos(folio) > start);
part = umin(fsize - foff, len);
/* If this next stretch of data aligns with the crypto
* algorithm block size, we can get a free copy from the
* encryption algorithm.
*/
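/* For instance (illustrative): a source buffer whose segments are suitably
* block-aligned can be encrypted straight into the bounce folio; otherwise
* we copy into the folio first and then encrypt it in place below.
*/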
iov_iter_truncate(&tmp, part);
if (!gap_before &&
!(gap_after && part == len) &&
netfs_is_crypto_aligned(wreq, &tmp)) {
if (!netfs_encrypt_to_folio(wreq, folio, iter, start, part, GFP_KERNEL)) {
ret = wreq->error;
break;
}
copied = part;
} else {
copied = copy_folio_from_iter(folio, foff, part, iter);
if (copied < part)
break;
copied += gap_before;
if (part == len)
copied += gap_after;
if (!netfs_encrypt_folio(wreq, folio, folio, start, copied, GFP_KERNEL)) {
ret = wreq->error;
break;
}
}
ret = netfs_write_bounce(wreq, folio, offset_in_folio(folio, start), copied);
if (ret < 0)
break;
start += copied;
if (part < len) {
written += copied;
written -= gap_before;
} else {
written += part;
}
if (foff + part >= fsize)
slot++;
gap_before = 0;
cond_resched();
}
error:
netfs_issue_write(wreq, upload);
smp_wmb(); /* Write lists before ALL_QUEUED. */
set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);
if (list_empty(&upload->subrequests))
netfs_wake_collector(wreq);
_leave(" = %d [%zx]", ret, written);
return written ?: ret;
}
/*
* Perform an unbuffered write where we may have to do an RMW operation on an
* encrypted file. This can also be used for direct I/O writes.
*/
ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter,
struct netfs_group *netfs_group)
{
struct netfs_io_request *wreq;
struct netfs_inode *ctx = netfs_inode(file_inode(iocb->ki_filp));
unsigned long long start = iocb->ki_pos;
unsigned long long end = start + iov_iter_count(iter);
ssize_t ret, n;
size_t len = iov_iter_count(iter);
bool async = !is_sync_kiocb(iocb);
_enter("");
_debug("uw %llx-%llx", start, end);
wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp, start, NULL,
iocb->ki_flags & IOCB_DIRECT ?
NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE);
if (IS_ERR(wreq))
return PTR_ERR(wreq);
wreq->io_streams[0].avail = true;
wreq->cleanup = netfs_cleanup_dio_write;
wreq->len = iov_iter_count(iter);
trace_netfs_write(wreq, (iocb->ki_flags & IOCB_DIRECT ?
netfs_write_trace_dio_write :
netfs_write_trace_unbuffered_write));
/* We're going to need a bounce buffer if what we transmit is going to
* be different in some way to the source buffer, e.g. because it gets
* encrypted/compressed or because it needs expanding to a block size.
*/
if (test_bit(NETFS_ICTX_ENCRYPTED, &ctx->flags)) {
ret = netfs_write_through_bounce_buffer(wreq, iocb, iter, netfs_group);
goto done;
}
/* If this is an async op and we're not using a bounce buffer, we have
* to save the source buffer as the iterator is only good until we
* return. In such a case, extract an iterator to represent as much of
* the output buffer as we can manage. Note that the extraction
* might not be able to allocate a sufficiently large bvec array and
* may shorten the request.
*/
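/* Example (illustrative): an io_uring-submitted O_DIRECT write from a user
* buffer takes this path; the extraction captures the user pages in a bvec
* array so that the data remains reachable after we return -EIOCBQUEUED.
*/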
if (user_backed_iter(iter)) {
n = netfs_extract_user_iter(iter, len, &wreq->buffer.iter, 0);
if (n < 0) {
ret = n;
goto out;
}
wreq->direct_bv = (struct bio_vec *)wreq->buffer.iter.bvec;
wreq->direct_bv_count = n;
wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);
} else {
/* If this is a kernel-generated async DIO request, assume that
* any resources the iterator points to (eg. a bio_vec array)
* will persist till the end of the op.
*/
wreq->buffer.iter = *iter;
}
__set_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags);
if (async)
__set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &wreq->flags);
/* Dispatch the write. */
__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
if (async)
wreq->iocb = iocb;
wreq->len = iov_iter_count(&wreq->buffer.iter);
ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), wreq->len);
done:
if (ret < 0) {
_debug("begin = %zd", ret);
goto out;
}
if (!async) {
trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip);
ret = netfs_wait_for_write(wreq);
if (ret > 0)
iocb->ki_pos += ret;
} else {
ret = -EIOCBQUEUED;
}
out:
netfs_put_request(wreq, netfs_rreq_trace_put_return);
return ret;
}
EXPORT_SYMBOL(netfs_unbuffered_write_iter_locked);
/**
* netfs_unbuffered_write_iter - Unbuffered write to a file
* @iocb: IO state structure
* @from: iov_iter with data to write
*
* Do an unbuffered write to a file, writing the data directly to the server
* and not lodging the data in the pagecache.
*
* Return:
* * Negative error code if no data has been written at all or
* vfs_fsync_range() failed for a synchronous write
* * Number of bytes written, even for truncated writes
*/
ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
struct netfs_inode *ictx = netfs_inode(inode);
ssize_t ret;
loff_t pos = iocb->ki_pos;
unsigned long long end = pos + iov_iter_count(from) - 1;
_enter("%llx,%zx,%llx", pos, iov_iter_count(from), i_size_read(inode));
if (!iov_iter_count(from))
return 0;
trace_netfs_write_iter(iocb, from);
netfs_stat(&netfs_n_wh_dio_write);
ret = netfs_start_io_direct(inode);
if (ret < 0)
return ret;
ret = generic_write_checks(iocb, from);
if (ret <= 0)
goto out;
ret = file_remove_privs(file);
if (ret < 0)
goto out;
ret = file_update_time(file);
if (ret < 0)
goto out;
if (iocb->ki_flags & IOCB_NOWAIT) {
/* We could block if there are any pages in the range. */
ret = -EAGAIN;
if (filemap_range_has_page(mapping, pos, end))
if (filemap_invalidate_inode(inode, true, pos, end))
goto out;
} else {
ret = filemap_write_and_wait_range(mapping, pos, end);
if (ret < 0)
goto out;
}
/*
* After a write we want buffered reads to be sure to go to disk to get
* the new data. We invalidate clean cached pages from the region we're
* about to write. We do this *before* the write so that we can return
* without clobbering -EIOCBQUEUED from ->direct_IO().
*/
ret = filemap_invalidate_inode(inode, true, pos, end);
if (ret < 0)
goto out;
end = iocb->ki_pos + iov_iter_count(from);
if (end > ictx->zero_point)
ictx->zero_point = end;
fscache_invalidate(netfs_i_cookie(ictx), NULL, i_size_read(inode),
FSCACHE_INVAL_DIO_WRITE);
ret = netfs_unbuffered_write_iter_locked(iocb, from, NULL);
out:
netfs_end_io_direct(inode);
return ret;
}
EXPORT_SYMBOL(netfs_unbuffered_write_iter);
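/*
* Example caller (illustrative sketch only; "myfs" is a hypothetical
* filesystem): a netfs-based filesystem's ->write_iter op might route
* O_DIRECT writes to this helper along these lines:
*
*	static ssize_t myfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
*	{
*		if (iocb->ki_flags & IOCB_DIRECT)
*			return netfs_unbuffered_write_iter(iocb, from);
*		return netfs_file_write_iter(iocb, from);
*	}
*/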