fs/netfs/buffered_read.c - linux/kernel/git/dhowells/linux-fs - Git at Google

 // SPDX-License-Identifier: GPL-2.0-or-later
 /* Network filesystem high-level buffered read support.
  *
  * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  */

 #include <linux/export.h>
 #include <linux/task_io_accounting_ops.h>
 #include "internal.h"

 /*
  * State carried through the dispatch of one buffered read.  The generic read
  * context is embedded as @r so that helpers handed a pointer to it can get
  * back to this structure with container_of().
  */
 struct netfs_buffered_read_context {
 	struct netfs_read_context r;		/* Generic part (issue position and stop point) */
 	struct fscache_occupancy cache;		/* List of cached extents */
 	unsigned long long	i_size;		/* Size of file (copied from the request) */
 	size_t			buffered;	/* Amount loaded into the buffer but not yet sliced off */
 	struct readahead_control *ractl;	/* Readahead source buffer (NULL if not readahead) */
 	struct bvecq_pos	dispatch_cursor; /* Cursor from which we dispatch ops */
 };

 /*
  * Offer the cache backend the opportunity to adjust the proposed readahead
  * region.  Nothing happens if no cache operations are attached to the request
  * or if the backend doesn't implement the ->expand_readahead() hook.
  */
 static void netfs_cache_expand_readahead(struct netfs_io_request *rreq,
 					 unsigned long long *_start,
 					 unsigned long long *_len,
 					 unsigned long long i_size)
 {
 	struct netfs_cache_resources *cres = &rreq->cache_resources;

 	if (!cres->ops || !cres->ops->expand_readahead)
 		return;

 	cres->ops->expand_readahead(cres, _start, _len, i_size);
 }

 /*
  * Let the cache and then the network filesystem adjust the extent of a
  * readahead request and, if either did, get the VM to resize the readahead
  * window to suit.  Whatever the hooks hand back must still contain the
  * original region.
  */
 static void netfs_rreq_expand(struct netfs_io_request *rreq,
 			      struct readahead_control *ractl)
 {
 	/* The cache gets the first say in the request parameters... */
 	netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size);

 	/* ...and then the netfs gets its turn. */
 	if (rreq->netfs_ops->expand_readahead)
 		rreq->netfs_ops->expand_readahead(rreq);

 	/* If nobody moved the goalposts, the window is already right. */
 	if (rreq->start == readahead_pos(ractl) &&
 	    rreq->len == readahead_length(ractl))
 		return;

 	/* Ask the VM to expand the window.  The VM may extend it further
 	 * still if it wishes to insert THPs and the preferred start and/or
 	 * end wind up in the middle of THPs - in which case the THP size
 	 * should be an integer multiple of the cache granule size, so we get
 	 * a whole number of granules to deal with.  Take whatever the window
 	 * ended up as to be the request extent.
 	 */
 	readahead_expand(ractl, rreq->start, rreq->len);
 	rreq->start = readahead_pos(ractl);
 	rreq->len = readahead_length(ractl);

 	trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
 			 netfs_read_trace_expanded);
 }

 /*
  * Zero the content of every folio still left in the readahead window, pulling
  * them out of the readahead_control a batch at a time and releasing each
  * batch once it has been cleared.  @rreq is not currently used.
  */
 static void netfs_clear_to_ra_end(struct netfs_io_request *rreq,
 				  struct netfs_buffered_read_context *rctx)
 {
 	struct folio_batch batch;

 	folio_batch_init(&batch);

 	while ((batch.nr = __readahead_batch(rctx->ractl,
 					     (struct page **)batch.folios,
 					     PAGEVEC_SIZE)) != 0) {
 		for (int slot = 0; slot < batch.nr; slot++) {
 			struct folio *victim = batch.folios[slot];

 			trace_netfs_folio(victim, netfs_folio_trace_zero_ra);
 			folio_zero_segment(victim, 0, folio_size(victim));
 		}
 		folio_batch_release(&batch);
 	}
 }

 /*
  * Begin a cache read operation for a request, using the cache cookie attached
  * to the inode.  The result of fscache_begin_read_operation() is passed back
  * unaltered.
  */
 static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_inode *ctx)
 {
 	struct netfs_cache_resources *cres = &rreq->cache_resources;

 	return fscache_begin_read_operation(cres, netfs_i_cookie(ctx));
 }

 /*
  * Prepare the I/O buffer on a buffered read subrequest for the filesystem to
  * use as a bvec queue.
  *
  * @subreq: The subrequest to attach a slice of buffer to
  * @base_rctx: The generic read context embedded in the buffered read context
  * @max_segs: The maximum number of segments the slice may contain
  *
  * If the read is a readahead, folios are first loaded from the readahead
  * window until at least @subreq->len bytes are buffered.  A slice is then cut
  * from the dispatch cursor and @subreq->len is shortened if the slice came up
  * short (for instance because @max_segs was hit).
  *
  * Returns 0 on success or a negative error code from loading or slicing.
  *
  * [!] NOTE: This must be run in the same thread as ->issue_read() was called
  * in as we access the readahead_control struct.
  */
 static int netfs_prepare_buffered_read_buffer(struct netfs_io_subrequest *subreq,
 					      struct netfs_read_context *base_rctx,
 					      unsigned int max_segs)
 {
 	struct netfs_buffered_read_context *rctx =
 		container_of(base_rctx, struct netfs_buffered_read_context, r);
 	struct netfs_io_request *rreq = subreq->rreq;
 	ssize_t extracted;

 	_enter("R=%08x[%x] l=%zx s=%u",
 	       rreq->debug_id, subreq->debug_index, subreq->len, max_segs);

 	if (rctx->ractl) {
 		/* If we don't have sufficient folios in the rolling buffer,
 		 * extract a bvecq's worth from the readahead region at a time
 		 * into the buffer.  Note that this acquires a ref on each page
 		 * that we will need to release later - but we don't want to do
 		 * that until after we've started the I/O.
 		 */
 		struct folio_batch put_batch;
 		ssize_t added = 0;

 		_debug("ractl %zx < %zx", rctx->buffered, subreq->len);

 		folio_batch_init(&put_batch);

 		/* NOTE(review): this relies on bvecq_load_from_ra() either
 		 * making progress or failing; a return of 0 would leave the
 		 * loop spinning - confirm that can't happen.
 		 */
 		while (rctx->buffered < subreq->len) {
 			added = bvecq_load_from_ra(&rreq->load_cursor, rctx->ractl,
 						   &put_batch);
 			if (added < 0)
 				break;
 			rctx->buffered += added;
 		}

 		/* Release the batch on the failure path too, otherwise any
 		 * folios batched up by earlier, successful passes round the
 		 * loop would never have their refs dropped.
 		 */
 		folio_batch_release(&put_batch);
 		if (added < 0)
 			return added;
 	}

 	/* Start the subrequest at the current dispatch point and then advance
 	 * the dispatch cursor over the slice taken.
 	 */
 	bvecq_pos_attach(&subreq->dispatch_pos, &rctx->dispatch_cursor);
 	bvecq_pos_attach(&subreq->content, &subreq->dispatch_pos);
 	extracted = bvecq_slice(&rctx->dispatch_cursor, subreq->len,
 				max_segs, &subreq->nr_segs);
 	if (extracted < 0)
 		return extracted;

 	rctx->buffered -= extracted;
 	if (extracted < subreq->len) {
 		subreq->len = extracted;
 		trace_netfs_sreq(subreq, netfs_sreq_trace_limited);
 	}

 	return 0;
 }

 /**
  * netfs_prepare_read_buffer - Get the buffer for a subrequest
  * @subreq: The subrequest to get the buffer for
  * @rctx: Read context
  * @max_segs: Maximum number of segments in buffer (or INT_MAX)
  *
  * Cut a slice of buffer off the stream and attach it to the subrequest as a
  * bio_vec queue.  @subreq->len sets the most data that will be attached, but
  * less may be attached if @max_segs would otherwise be exceeded.  The way the
  * buffer is obtained is selected by the origin of the request.
  *
  * Return: 0 on success or a negative error code; -EIO if the origin of the
  * request isn't one that is handled here.
  *
  * [!] NOTE: This must be run in the same thread as ->issue_read() was called
  * in as we access the readahead_control struct if there is one.
  */
 int netfs_prepare_read_buffer(struct netfs_io_subrequest *subreq,
 			      struct netfs_read_context *rctx,
 			      unsigned int max_segs)
 {
 	int ret;

 	switch (subreq->rreq->origin) {
 	case NETFS_READAHEAD:
 	case NETFS_READPAGE:
 	case NETFS_READ_FOR_WRITE:
 		/* Pagecache-backed reads; retries use a separate path. */
 		ret = rctx->retrying ?
 			netfs_prepare_buffered_read_retry_buffer(subreq, rctx, max_segs) :
 			netfs_prepare_buffered_read_buffer(subreq, rctx, max_segs);
 		break;

 	case NETFS_UNBUFFERED_READ:
 	case NETFS_DIO_READ:
 	case NETFS_READ_GAPS:
 		ret = netfs_prepare_unbuffered_read_buffer(subreq, rctx, max_segs);
 		break;

 	case NETFS_READ_SINGLE:
 		ret = netfs_prepare_read_single_buffer(subreq, rctx, max_segs);
 		break;

 	default:
 		WARN_ON_ONCE(1);
 		ret = -EIO;
 		break;
 	}

 	return ret;
 }
 EXPORT_SYMBOL(netfs_prepare_read_buffer);

 /*
  * Ask the cache what it holds within the window described by @occ.
  *
  * The granularity is preset to PAGE_SIZE and no_more_cache to true before
  * anything else, so those are the values the caller sees if the window is
  * empty or there's no cache attached (in which case 0 is returned).
  * Otherwise the start of the window is rounded up to the granularity and the
  * result of the cache's ->query_occupancy() is returned.
  */
 int netfs_read_query_cache(struct netfs_io_request *rreq,
 			   struct fscache_occupancy *occ)
 {
 	struct netfs_cache_resources *cres = &rreq->cache_resources;

 	occ->no_more_cache = true;
 	occ->granularity = PAGE_SIZE;

 	if (occ->query_from >= occ->query_to || !cres->ops)
 		return 0;

 	occ->query_from = round_up(occ->query_from, occ->granularity);
 	return cres->ops->query_occupancy(cres, occ);
 }

 /**
  * netfs_mark_read_submission - Mark a read subrequest as being ready for submission
  * @subreq: The subrequest to be marked
  * @rctx: Read context supplied to ->issue_read()
  *
  * Calling this marks a read subrequest as being ready for submission and makes
  * it available to the collection thread.  After calling this, the filesystem's
  * ->issue_read() method must invoke netfs_read_subreq_terminated() to end the
  * subrequest and must return -EIOCBQUEUED.
  *
  * The subrequest is queued on stream 0 of the request (unless it is already
  * on a list), the issuing position in @rctx is advanced to the end of the
  * subrequest and, if that reaches the stop position, the request is flagged
  * as having had all its subrequests queued.
  */
 void netfs_mark_read_submission(struct netfs_io_subrequest *subreq,
 				struct netfs_read_context *rctx)
 {
 	struct netfs_io_request *rreq = subreq->rreq;
 	struct netfs_io_stream *stream = &rreq->io_streams[0];

 	_enter("R=%08x[%x]", rreq->debug_id, subreq->debug_index);

 	/* NOTE(review): non-atomic bit set - presumably fine because the
 	 * subrequest isn't visible to the collector until it is queued below;
 	 * confirm for subrequests that are already on the list.
 	 */
 	__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);

 	/* We add to the end of the list whilst the collector may be walking
 	 * the list.  The collector only goes nextwards and uses the lock to
 	 * remove entries off of the front.
 	 */
 	spin_lock(&rreq->lock);
 	if (list_empty(&subreq->rreq_link)) {
 		list_add_tail(&subreq->rreq_link, &stream->subrequests);
 		if (list_is_first(&subreq->rreq_link, &stream->subrequests)) {
 			/* The list was empty, so this is the new front. */
 			stream->front = subreq;
 			if (!stream->active) {
 				/* First activation: collection starts from
 				 * the beginning of this subrequest.
 				 */
 				stream->collected_to = stream->front->start;
 				/* Store list pointers before active flag */
 				smp_store_release(&stream->active, true);
 			}
 		}
 	}

 	/* NOTE(review): rreq->submitted is also preset by the callers in this
 	 * file (to rreq->start for readahead, to rreq->start plus the folio
 	 * size for single-folio reads and gap reads), so what the accumulated
 	 * value means differs by origin - confirm that's intended.
 	 */
 	rreq->submitted += subreq->len;

 	/* Move the issuing position past this subrequest; once it reaches the
 	 * stop position there's nothing further to queue.
 	 */
 	rctx->start = subreq->start + subreq->len;
 	if (rctx->start >= rctx->stop) {
 		smp_wmb(); /* Write lists before ALL_QUEUED. */
 		set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
 		trace_netfs_rreq(rreq, netfs_rreq_trace_all_queued);
 	}

 	spin_unlock(&rreq->lock);

 	trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
 }
 EXPORT_SYMBOL(netfs_mark_read_submission);

 /*
  * Hand a subrequest over to whatever is going to satisfy it, as selected by
  * its source:
  *
  *  - a download goes to the filesystem's ->issue_read() and that method's
  *    return value is passed back;
  *  - a cache read goes to the cache's ->issue_read() and -EIOCBQUEUED is
  *    returned regardless;
  *  - anything else is completed here and now by zeroing the buffer, after
  *    which, for readahead, the remainder of the readahead window is cleared
  *    too and 0 is returned.
  */
 static int netfs_issue_read(struct netfs_io_request *rreq,
 			    struct netfs_io_subrequest *subreq,
 			    struct netfs_buffered_read_context *rctx)
 {
 	struct netfs_read_context *base = &rctx->r;

 	_enter("R=%08x[%x]", rreq->debug_id, subreq->debug_index);

 	if (subreq->source == NETFS_DOWNLOAD_FROM_SERVER)
 		return rreq->netfs_ops->issue_read(subreq, base);

 	if (subreq->source == NETFS_READ_FROM_CACHE) {
 		struct netfs_cache_resources *cres = &rreq->cache_resources;

 		netfs_stat(&netfs_n_rh_read);
 		cres->ops->issue_read(subreq, base);
 		return -EIOCBQUEUED;
 	}

 	/* No I/O is required; queue the subrequest and then immediately
 	 * complete it with a buffer full of zeroes.  The subrequest must not
 	 * be touched after it has been terminated.
 	 */
 	netfs_mark_read_submission(subreq, base);
 	bvecq_zero(&rctx->dispatch_cursor, subreq->len);
 	subreq->transferred = subreq->len;
 	subreq->error = 0;
 	netfs_read_subreq_terminated(subreq);

 	if (rctx->ractl)
 		netfs_clear_to_ra_end(rreq, rctx);
 	return 0;
 }

 /*
  * Perform a read to the pagecache from a series of sources of different types,
  * slicing up the region to be read according to available cache blocks and
  * network rsize.
  *
  * @rreq: The request, covering rreq->start to rreq->start + rreq->len
  * @ractl: The readahead window to draw folios from, or NULL if the buffer
  *	   has already been set up on rreq->load_cursor
  *
  * Subrequests are generated one at a time, each beginning at the current
  * issuing position, until that position reaches the end of the request or
  * something fails.  Each is sourced from the cache, from the server or is
  * simply zero-filled.
  *
  * Nothing is returned: the first error is stored in rreq->error (if that
  * isn't already set) because the caller may still need to wait for I/O that
  * has been started.
  */
 static void netfs_read_to_pagecache(struct netfs_io_request *rreq,
 				    struct readahead_control *ractl)
 {
 	struct netfs_buffered_read_context rctx = {
 		.cache.query_from	= rreq->start,
 		.cache.query_to		= rreq->start + rreq->len,
 		.cache.cached_from[0]	= ULLONG_MAX,
 		.cache.cached_to[0]	= ULLONG_MAX,
 		.r.start		= rreq->start,
 		.r.stop			= rreq->start + rreq->len,
 		.i_size			= rreq->i_size,
 		.ractl			= ractl,
 	};
 	struct netfs_inode *ictx = netfs_inode(rreq->inode);
 	int ret = 0;

 	_enter("R=%08x", rreq->debug_id);

 	/* Dispatch and collection both begin where loading begins. */
 	bvecq_pos_attach(&rctx.dispatch_cursor, &rreq->load_cursor);
 	bvecq_pos_attach(&rreq->collect_cursor, &rctx.dispatch_cursor);

 	do {
 		struct netfs_io_subrequest *subreq;
 		struct fscache_occupancy *occ = &rctx.cache;
 		unsigned long long hole_to = ULLONG_MAX, cache_to = ULLONG_MAX;
 		unsigned long long zlimit;

 		/* If we don't have any, find out the next couple of data
 		 * extents from the cache, containing or following the
 		 * specified start offset.  Holes have to be fetched from the
 		 * server; data regions from the cache.
 		 */
 		if (!occ->no_more_cache) {
 			if (!occ->nr_extents) {
 				ret = netfs_read_query_cache(rreq, &rctx.cache);
 				if (ret < 0)
 					break;
 				if (occ->no_more_cache) {
 					occ->cached_from[0] = ULLONG_MAX;
 					occ->cached_to[0] = ULLONG_MAX;
 					occ->nr_extents = 0;
 				}
 			}

 			/* Shuffle down the extent list to evict used-up or
 			 * useless extents.
 			 */
 			if (occ->nr_extents) {
 				/* Trim the extent inwards to whole granules;
 				 * if nothing is left, mark it as used up.
 				 */
 				hole_to  = round_up(occ->cached_from[0], occ->granularity);
 				cache_to = round_down(occ->cached_to[0], occ->granularity);
 				if (hole_to > cache_to) {
 					occ->cached_to[0] = rctx.r.start;
 				} else {
 					occ->cached_from[0] = hole_to;
 					occ->cached_to[0] = cache_to;
 				}

 				if (rctx.r.start >= occ->cached_to[0]) {
 					for (int i = 1; i < occ->nr_extents; i++) {
 						occ->cached_from[i - 1] = occ->cached_from[i];
 						occ->cached_to[i - 1]   = occ->cached_to[i];
 						occ->cached_type[i - 1] = occ->cached_type[i];
 					}
 					occ->nr_extents--;
 					continue;
 				}
 			}
 		}

 		subreq = netfs_alloc_subrequest(rreq);
 		if (!subreq) {
 			ret = -ENOMEM;
 			break;
 		}

 		subreq->start = rctx.r.start;

 		hole_to  = occ->cached_from[0];
 		cache_to = occ->cached_to[0];

 		/* Nothing at or beyond the lower of the EOF and the zero-point
 		 * needs to be fetched from the server.
 		 */
 		zlimit = umin(rctx.i_size, ictx->zero_point);

 		_debug("rsub %llx %llx-%llx", subreq->start, hole_to, cache_to);

 		if (occ->nr_extents &&
 		    rctx.r.start >= hole_to && rctx.r.start < cache_to) {
 			/* Overlap with a cached region, where the cache may
 			 * record a block of zeroes.
 			 */
 			_debug("cached");
 			subreq->len = cache_to - rctx.r.start;
 			if (occ->cached_type[0] == FSCACHE_EXTENT_ZERO) {
 				subreq->source = NETFS_FILL_WITH_ZEROES;
 				netfs_stat(&netfs_n_rh_zero);
 			} else {
 				subreq->source = NETFS_READ_FROM_CACHE;
 			}
 		} else if (subreq->start >= zlimit &&
 			   subreq->start < rctx.r.stop) {
 			/* If this range lies beyond the zero-point or the
 			 * EOF, it can just be cleared locally.  The EOF has to
 			 * be checked as well as the zero-point, otherwise a
 			 * position at or past the EOF but short of the
 			 * zero-point would be sent down the download path
 			 * below with nothing left to download.
 			 */
 			_debug("zero %llx-%llx", rctx.r.start, rctx.r.stop);
 			subreq->len = rctx.r.stop - rctx.r.start;
 			subreq->source = NETFS_FILL_WITH_ZEROES;
 			netfs_stat(&netfs_n_rh_zero);
 		} else {
 			/* Read a cache hole from the server.  If any part of
 			 * this range lies beyond the zero-point or the EOF,
 			 * that part can just be cleared locally.
 			 */
 			unsigned long long limit = min3(zlimit, rctx.r.stop, hole_to);

 			_debug("limit %llx %llx", rctx.i_size, ictx->zero_point);
 			_debug("download %llx-%llx", rctx.r.start, rctx.r.stop);

 			/* Don't let the subtraction wrap if the limit isn't
 			 * ahead of us; report a zero-length read instead.
 			 */
 			if (limit > subreq->start)
 				subreq->len = umin(limit - subreq->start, ULONG_MAX);
 			else
 				subreq->len = 0;
 			subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
 			if (rreq->cache_resources.ops)
 				__set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
 			netfs_stat(&netfs_n_rh_download);
 		}

 		if (subreq->len == 0) {
 			pr_err("ZERO-LEN READ: R=%08x[%x] l=%zx/%llx s=%llx z=%llx i=%llx",
 			       rreq->debug_id, subreq->debug_index,
 			       subreq->len, rctx.r.stop - subreq->start,
 			       subreq->start, ictx->zero_point, rreq->i_size);
 			/* We can't make progress, so fail the request rather
 			 * than reporting success with part of the region
 			 * unread.  The subrequest was never queued, so release
 			 * both refs as for a failed issue.
 			 */
 			ret = -EIO;
 			subreq->error = ret;
 			trace_netfs_sreq(subreq, netfs_sreq_trace_cancel);
 			netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel);
 			netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel);
 			break;
 		}

 		ret = netfs_issue_read(rreq, subreq, &rctx);
 		if (ret != 0 && ret != -EIOCBQUEUED) {
 			subreq->error = ret;
 			trace_netfs_sreq(subreq, netfs_sreq_trace_cancel);
 			/* Not queued - release both refs. */
 			netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel);
 			netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel);
 			break;
 		}
 		ret = 0;

 		cond_resched();
 	} while (rctx.r.start < rctx.r.stop);

 	/* If we stopped early, nothing more is going to be queued, so tell
 	 * the collector that what it can see is all there will be.
 	 */
 	if (unlikely(rctx.r.start < rctx.r.stop)) {
 		smp_wmb(); /* Write lists before ALL_QUEUED. */
 		set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
 		netfs_wake_collector(rreq);
 	}

 	/* Defer error return as we may need to wait for outstanding I/O. */
 	cmpxchg(&rreq->error, 0, ret);

 	bvecq_pos_detach(&rreq->load_cursor);
 	bvecq_pos_detach(&rctx.dispatch_cursor);
 }

 /**
  * netfs_readahead - Helper to manage a read request
  * @ractl: The description of the readahead request
  *
  * Fulfil a readahead request by drawing data from the cache if possible, or
  * the netfs if not.  Space beyond the EOF is zero-filled.  Multiple I/O
  * requests from different sources will get munged together.  If necessary, the
  * readahead window can be expanded in either direction to a more convenient
  * alignment for RPC efficiency or to make storage in the cache feasible.
  *
  * Collection of the results is offloaded, so this does not wait for the read
  * to complete.  No error is returned; if the request cannot be set up, it is
  * simply discarded.
  *
  * The calling netfs must initialise a netfs context contiguous to the vfs
  * inode before calling this.
  *
  * This is usable whether or not caching is enabled.
  */
 void netfs_readahead(struct readahead_control *ractl)
 {
 	struct netfs_inode *ictx = netfs_inode(ractl->mapping->host);
 	struct netfs_io_request *rreq;
 	unsigned long long pos = readahead_pos(ractl);
 	size_t count = readahead_length(ractl);
 	int ret;

 	_enter("");

 	rreq = netfs_alloc_request(ractl->mapping, ractl->file, pos, count,
 				   NETFS_READAHEAD);
 	if (IS_ERR(rreq))
 		return;

 	__set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags);

 	/* Only these failures from the cache are fatal to the request. */
 	ret = netfs_begin_cache_read(rreq, ictx);
 	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) {
 		netfs_put_failed_request(rreq);
 		return;
 	}

 	netfs_stat(&netfs_n_rh_readahead);
 	trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
 			 netfs_read_trace_readahead);

 	netfs_rreq_expand(rreq, ractl);

 	rreq->submitted = rreq->start;
 	if (bvecq_buffer_init(&rreq->load_cursor, rreq->debug_id) < 0) {
 		netfs_put_failed_request(rreq);
 		return;
 	}

 	netfs_read_to_pagecache(rreq, ractl);
 	netfs_put_request(rreq, netfs_rreq_trace_put_return);
 }
 EXPORT_SYMBOL(netfs_readahead);

 /*
  * Set up the request's buffer queue so that it holds just the one folio, with
  * a single segment spanning the whole of it.  rreq->submitted is set to the
  * end of the folio.  Returns 0 on success or -ENOMEM if the buffer queue
  * couldn't be initialised.
  */
 static int netfs_create_singular_buffer(struct netfs_io_request *rreq, struct folio *folio)
 {
 	const size_t span = folio_size(folio);
 	struct bvecq *queue;
 	unsigned int slot;

 	if (bvecq_buffer_init(&rreq->load_cursor, rreq->debug_id) < 0)
 		return -ENOMEM;

 	queue = rreq->load_cursor.bvecq;
 	slot = queue->nr_segs++;
 	bvec_set_folio(&queue->bv[slot], folio, span, 0);

 	rreq->submitted = rreq->start + span;
 	return 0;
 }

 /*
  * Read into gaps in a folio partially filled by a streaming write.
  *
  * The dirty region recorded in the folio's netfs_folio info must not be
  * overwritten, so a buffer is built in which the part of the read that
  * corresponds to the dirty region lands in a throwaway "sink" page and only
  * the gap before and/or after it lands in the folio.
  *
  * The folio is unlocked on all paths.  Returns 0 on success or a negative
  * error code.
  */
 static int netfs_read_gaps(struct file *file, struct folio *folio)
 {
 	struct netfs_io_request *rreq;
 	struct address_space *mapping = folio->mapping;
 	struct netfs_folio *finfo = netfs_folio_info(folio);
 	struct netfs_inode *ctx = netfs_inode(mapping->host);
 	struct bvecq *bq = NULL;
 	struct page *sink = NULL;
 	unsigned int from = finfo->dirty_offset;	/* Start of dirty region in folio */
 	unsigned int to = from + finfo->dirty_len;	/* End of dirty region in folio */
 	unsigned int off = 0;
 	size_t flen = folio_size(folio);
 	/* At most one sink segment per page of folio plus one segment each for
 	 * the leading and trailing gaps.
 	 */
 	size_t nr_bvec = flen / PAGE_SIZE + 2;
 	size_t part;
 	int ret;

 	_enter("%lx", folio->index);

 	rreq = netfs_alloc_request(mapping, file, folio_pos(folio), flen, NETFS_READ_GAPS);
 	if (IS_ERR(rreq)) {
 		ret = PTR_ERR(rreq);
 		goto alloc_error;
 	}

 	/* Only these failures from the cache are fatal to the request. */
 	ret = netfs_begin_cache_read(rreq, ctx);
 	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
 		goto discard;

 	netfs_stat(&netfs_n_rh_read_folio);
 	trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_read_gaps);

 	/* Fiddle the buffer so that a gap at the beginning and/or a gap at the
 	 * end get copied to, but the middle is discarded.
 	 */
 	ret = -ENOMEM;
 	bq = netfs_alloc_bvecq(nr_bvec, GFP_KERNEL);
 	if (!bq)
 		goto discard;
 	/* NOTE(review): the queue now hangs off the request, so presumably
 	 * disposing of the request on the discard path disposes of the queue
 	 * too - confirm.
 	 */
 	rreq->load_cursor.bvecq = bq;

 	sink = alloc_page(GFP_KERNEL);
 	if (!sink)
 		goto discard;

 	trace_netfs_folio(folio, netfs_folio_trace_read_gaps);

 	/* NOTE(review): presumably ->free tells the bvecq cleanup to drop the
 	 * page refs taken below for each segment - confirm against the bvecq
 	 * implementation.
 	 */
 	for (struct bvecq *p = bq; p; p = p->next)
 		p->free = true;

 	/* Leading gap: read straight into the folio. */
 	if (from > 0) {
 		folio_get(folio);
 		bvec_set_folio(&bq->bv[bq->nr_segs++], folio, from, 0);
 		off = from;
 	}
 	/* Dirty region: divert into the sink page, a page's worth at a time,
 	 * each segment taking its own ref on the sink.
 	 * NOTE(review): bq->next is followed without a NULL check, relying on
 	 * netfs_alloc_bvecq() having provided nr_bvec slots in total - confirm.
 	 */
 	while (off < to) {
 		if (bvecq_is_full(bq))
 			bq = bq->next;
 		part = umin(to - off, PAGE_SIZE);
 		get_page(sink);
 		bvec_set_page(&bq->bv[bq->nr_segs++], sink, part, 0);
 		off += part;
 	}
 	/* Trailing gap: read into the rest of the folio. */
 	if (to < flen) {
 		if (bvecq_is_full(bq))
 			bq = bq->next;
 		folio_get(folio);
 		bvec_set_folio(&bq->bv[bq->nr_segs++], folio, flen - to, to);
 	}

 	/* NOTE(review): bq may have been advanced along the chain by this
 	 * point, so this dumps from the last queue segment used rather than
 	 * from the head - check whether that (or the dump at all) is wanted.
 	 */
 	dump_bvecq(bq);

 	rreq->submitted = rreq->start + flen;

 	netfs_read_to_pagecache(rreq, NULL);

 	/* Drop the ref from alloc_page(); the segments hold their own. */
 	put_page(sink);

 	ret = netfs_wait_for_read(rreq);
 	if (ret >= 0) {
 		flush_dcache_folio(folio);
 		folio_mark_uptodate(folio);
 	}
 	folio_unlock(folio);
 	netfs_put_request(rreq, netfs_rreq_trace_put_return);
 	return ret < 0 ? ret : 0;

 discard:
 	if (sink)
 		put_page(sink);
 	netfs_put_failed_request(rreq);
 alloc_error:
 	folio_unlock(folio);
 	return ret;
 }

 /**
  * netfs_read_folio - Helper to manage a read_folio request
  * @file: The file to read from
  * @folio: The folio to read
  *
  * Fulfil a read_folio request by drawing data from the cache if
  * possible, or the netfs if not.  Space beyond the EOF is zero-filled.
  * Multiple I/O requests from different sources will get munged together.
  *
  * A folio that is marked dirty is handed off to the gap-filling read instead
  * so that the dirty data in it is not overwritten.
  *
  * The calling netfs must initialise a netfs context contiguous to the vfs
  * inode before calling this.
  *
  * This is usable whether or not caching is enabled.
  *
  * Return: 0 on success or a negative error code.  If the request could not be
  * set up, the folio is unlocked here before returning.
  */
 int netfs_read_folio(struct file *file, struct folio *folio)
 {
 	struct address_space *mapping = folio->mapping;
 	struct netfs_inode *ctx = netfs_inode(mapping->host);
 	struct netfs_io_request *rreq;
 	int ret;

 	if (folio_test_dirty(folio)) {
 		trace_netfs_folio(folio, netfs_folio_trace_read_gaps);
 		return netfs_read_gaps(file, folio);
 	}

 	_enter("%lx", folio->index);

 	rreq = netfs_alloc_request(mapping, file,
 				   folio_pos(folio), folio_size(folio),
 				   NETFS_READPAGE);
 	if (IS_ERR(rreq)) {
 		ret = PTR_ERR(rreq);
 		goto out_unlock;
 	}

 	/* Only these failures from the cache are fatal to the request. */
 	ret = netfs_begin_cache_read(rreq, ctx);
 	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
 		goto out_discard;

 	netfs_stat(&netfs_n_rh_read_folio);
 	trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);

 	/* The folio itself is the entire output buffer. */
 	ret = netfs_create_singular_buffer(rreq, folio);
 	if (ret < 0)
 		goto out_discard;

 	netfs_read_to_pagecache(rreq, NULL);
 	ret = netfs_wait_for_read(rreq);
 	netfs_put_request(rreq, netfs_rreq_trace_put_return);
 	if (ret > 0)
 		ret = 0;
 	return ret;

 out_discard:
 	netfs_put_failed_request(rreq);
 out_unlock:
 	folio_unlock(folio);
 	return ret;
 }
 EXPORT_SYMBOL(netfs_read_folio);

 /*
  * Prepare a folio for writing without reading first
  * @folio: The folio being prepared
  * @pos: starting position for the write
  * @len: length of write
  * @always_fill: T if the folio should always be completely filled/cleared
  *
  * There are cases in which write_begin has no need to read anything:
  * - the write covers the whole folio
  * - the folio lies completely beyond the EOF
  * - the write runs from the start of the folio up to or past the EOF
  *
  * In the latter two cases the parts of the folio that won't be written are
  * zeroed.  Returns true if the read can be skipped and false if not.
  *
  * If @always_fill is set, the rules differ: false is returned if the write
  * ends at or before the EOF, otherwise the entire folio is zeroed and marked
  * uptodate and true is returned.
  */
 static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len,
 				 bool always_fill)
 {
 	struct inode *inode = folio_inode(folio);
 	loff_t i_size = i_size_read(inode);
 	size_t offset = offset_in_folio(folio, pos);
 	size_t plen = folio_size(folio);

 	if (unlikely(always_fill)) {
 		if (pos - offset + len > i_size) {
 			folio_zero_segment(folio, 0, plen);
 			folio_mark_uptodate(folio);
 			return true;
 		}
 		return false; /* Page entirely before EOF */
 	}

 	/* Full folio write - nothing is left over to clear */
 	if (offset == 0 && len >= plen)
 		return true;

 	/* Either the page is entirely beyond the end of the file or the write
 	 * covers from the start of the folio to EOF or beyond.  Clear what
 	 * lies either side of the region to be written.
 	 */
 	if (pos - offset >= i_size ||
 	    (offset == 0 && (pos + len) >= i_size)) {
 		folio_zero_segments(folio, 0, offset, offset + len, plen);
 		return true;
 	}

 	return false;
 }

 /**
  * netfs_write_begin - Helper to prepare for writing [DEPRECATED]
  * @ctx: The netfs context
  * @file: The file to read from
  * @mapping: The mapping to read from
  * @pos: File position at which the write will begin
  * @len: The length of the write (may extend beyond the end of the folio chosen)
  * @_folio: Where to put the resultant folio
  * @_fsdata: Place for the netfs to store a cookie
  *
  * Pre-read data for a write-begin request by drawing data from the cache if
  * possible, or the netfs if not.  Space beyond the EOF is zero-filled.
  * Multiple I/O requests from different sources will get munged together.
  *
  * The calling netfs must provide a table of operations, only one of which,
  * issue_read, is mandatory.
  *
  * The check_write_begin() operation can be provided to check for and flush
  * conflicting writes once the folio is grabbed and locked.  It is passed a
  * pointer to the fsdata cookie that gets returned to the VM to be passed to
  * write_end.  It is permitted to sleep.  It should return 0 if the request
  * should go ahead or it may return an error.  It may also unlock and put the
  * folio, provided it sets ``*foliop`` to NULL, in which case a return of 0
  * will cause the folio to be re-got and the process to be retried.
  *
  * The calling netfs must initialise a netfs context contiguous to the vfs
  * inode before calling this.
  *
  * This is usable whether or not caching is enabled.
  *
  * Note that this should be considered deprecated and netfs_perform_write()
  * used instead.
  *
  * Return: 0 on success, with the locked folio stored in ``*_folio``, or a
  * negative error code, in which case the folio (if still held) has been
  * unlocked and put.
  */
 int netfs_write_begin(struct netfs_inode *ctx,
 		      struct file *file, struct address_space *mapping,
 		      loff_t pos, unsigned int len, struct folio **_folio,
 		      void **_fsdata)
 {
 	struct netfs_io_request *rreq;
 	struct folio *folio;
 	pgoff_t index = pos >> PAGE_SHIFT;
 	int ret;

 retry:
 	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
 				    mapping_gfp_mask(mapping));
 	if (IS_ERR(folio))
 		return PTR_ERR(folio);

 	if (ctx->ops->check_write_begin) {
 		/* Allow the netfs (eg. ceph) to flush conflicts. */
 		ret = ctx->ops->check_write_begin(file, pos, len, &folio, _fsdata);
 		if (ret < 0) {
 			trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin);
 			goto error;
 		}
 		/* The netfs disposed of the folio; go and get it again. */
 		if (!folio)
 			goto retry;
 	}

 	if (folio_test_uptodate(folio))
 		goto have_folio;

 	/* If the folio is beyond the EOF, we want to clear it - unless it's
 	 * within the cache granule containing the EOF, in which case we need
 	 * to preload the granule.
 	 */
 	if (!netfs_is_cache_enabled(ctx) &&
 	    netfs_skip_folio_read(folio, pos, len, false)) {
 		netfs_stat(&netfs_n_rh_write_zskip);
 		goto have_folio_no_wait;
 	}

 	rreq = netfs_alloc_request(mapping, file,
 				   folio_pos(folio), folio_size(folio),
 				   NETFS_READ_FOR_WRITE);
 	if (IS_ERR(rreq)) {
 		ret = PTR_ERR(rreq);
 		goto error;
 	}
 	/* The folio must be left locked for the caller when the read ends. */
 	rreq->no_unlock_folio	= folio->index;
 	__set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);

 	/* Only these failures from the cache are fatal to the request. */
 	ret = netfs_begin_cache_read(rreq, ctx);
 	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
 		goto error_put;

 	netfs_stat(&netfs_n_rh_write_begin);
 	trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);

 	/* Set up the output buffer */
 	ret = netfs_create_singular_buffer(rreq, folio);
 	if (ret < 0)
 		goto error_put;

 	netfs_read_to_pagecache(rreq, NULL);
 	ret = netfs_wait_for_read(rreq);
 	/* We're done with the request whether or not the read worked, so drop
 	 * our ref before looking at the result; otherwise a failed read would
 	 * leak the request.
 	 */
 	netfs_put_request(rreq, netfs_rreq_trace_put_return);
 	if (ret < 0)
 		goto error;

 have_folio:
 	ret = folio_wait_private_2_killable(folio);
 	if (ret < 0)
 		goto error;
 have_folio_no_wait:
 	*_folio = folio;
 	_leave(" = 0");
 	return 0;

 error_put:
 	netfs_put_failed_request(rreq);
 error:
 	if (folio) {
 		folio_unlock(folio);
 		folio_put(folio);
 	}
 	_leave(" = %d", ret);
 	return ret;
 }
 EXPORT_SYMBOL(netfs_write_begin);

 /*
  * Preload the data into a folio we're proposing to write into.
  *
  * The whole of the folio is read and the request is flagged so that the folio
  * is not unlocked when the read completes.  @offset and @len are not
  * currently used.  Returns 0 on success or a negative error code; the folio
  * is not unlocked here on failure either.
  */
 int netfs_prefetch_for_write(struct file *file, struct folio *folio,
 			     size_t offset, size_t len)
 {
 	struct address_space *mapping = folio->mapping;
 	struct netfs_inode *ctx = netfs_inode(mapping->host);
 	struct netfs_io_request *rreq;
 	unsigned long long start = folio_pos(folio);
 	size_t flen = folio_size(folio);
 	int ret;

 	_enter("%zx @%llx", flen, start);

 	rreq = netfs_alloc_request(mapping, file, start, flen,
 				   NETFS_READ_FOR_WRITE);
 	if (IS_ERR(rreq)) {
 		ret = PTR_ERR(rreq);
 		goto out;
 	}

 	rreq->no_unlock_folio = folio->index;
 	__set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);

 	/* Only these failures from the cache are fatal to the request. */
 	ret = netfs_begin_cache_read(rreq, ctx);
 	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
 		goto out_discard;

 	netfs_stat(&netfs_n_rh_write_begin);
 	trace_netfs_read(rreq, start, flen, netfs_read_trace_prefetch_for_write);

 	/* The folio itself is the entire output buffer. */
 	ret = netfs_create_singular_buffer(rreq, folio);
 	if (ret < 0)
 		goto out_discard;
 	rreq->load_cursor.bvecq->free = true;

 	netfs_read_to_pagecache(rreq, NULL);
 	ret = netfs_wait_for_read(rreq);
 	netfs_put_request(rreq, netfs_rreq_trace_put_return);
 	if (ret > 0)
 		ret = 0;
 	return ret;

 out_discard:
 	netfs_put_failed_request(rreq);
 out:
 	_leave(" = %d", ret);
 	return ret;
 }

 /**
  * netfs_buffered_read_iter - Filesystem buffered I/O read routine
  * @iocb: kernel I/O control block
  * @iter: destination for the data read
  *
  * This is the ->read_iter() routine for all filesystems that can use the page
  * cache directly.  It must not be used for direct I/O or on an inode that is
  * flagged as unbuffered; doing so produces a warning and -EINVAL.
  *
  * The read is performed by filemap_read() bracketed by netfs_start_io_read()
  * and netfs_end_io_read().
  *
  * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be
  * returned when no data can be read without waiting for I/O requests to
  * complete; it doesn't prevent readahead.
  *
  * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests
  * shall be made for the read or for readahead.  When no data can be read,
  * -EAGAIN shall be returned.  When readahead would be triggered, a partial,
  * possibly empty read shall be returned.
  *
  * Return:
  * * number of bytes copied, even for partial reads
  * * negative error code (or 0 if IOCB_NOIO) if nothing was read
  */
 ssize_t netfs_buffered_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
 	struct netfs_inode *ictx = netfs_inode(inode);
 	ssize_t ret;

 	if (WARN_ON_ONCE((iocb->ki_flags & IOCB_DIRECT) ||
 			 test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags)))
 		return -EINVAL;

 	ret = netfs_start_io_read(inode);
 	if (ret != 0)
 		return ret;

 	ret = filemap_read(iocb, iter, 0);
 	netfs_end_io_read(inode);
 	return ret;
 }
 EXPORT_SYMBOL(netfs_buffered_read_iter);

 /**
  * netfs_file_read_iter - Generic filesystem read routine
  * @iocb: kernel I/O control block
  * @iter: destination for the data read
  *
  * This is the ->read_iter() routine for all filesystems that can use the page
  * cache directly.
  *
  * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be
  * returned when no data can be read without waiting for I/O requests to
  * complete; it doesn't prevent readahead.
  *
  * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests
  * shall be made for the read or for readahead.  When no data can be read,
  * -EAGAIN shall be returned.  When readahead would be triggered, a partial,
  * possibly empty read shall be returned.
  *
  * Return:
  * * number of bytes copied, even for partial reads
  * * negative error code (or 0 if IOCB_NOIO) if nothing was read
  */
 ssize_t netfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct netfs_inode *ictx = netfs_inode(iocb->ki_filp->f_mapping->host);

 	if ((iocb->ki_flags & IOCB_DIRECT) ||
 	    test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
 		return netfs_unbuffered_read_iter(iocb, iter);

 	return netfs_buffered_read_iter(iocb, iter);
 }
 EXPORT_SYMBOL(netfs_file_read_iter);