| // SPDX-License-Identifier: GPL-2.0-only |
| /* Network filesystem high-level write support. |
| * |
| * Copyright (C) 2022 Red Hat, Inc. All Rights Reserved. |
| * Written by David Howells (dhowells@redhat.com) |
| */ |
| |
| #include <linux/export.h> |
| #include <linux/fs.h> |
| #include <linux/mm.h> |
| #include <linux/pagemap.h> |
| #include <linux/slab.h> |
| #include <linux/backing-dev.h> |
| #include "internal.h" |
| |
| /* Amount of write credit available */ |
| atomic_long_t netfs_write_credit = ATOMIC_LONG_INIT(320 * 1024 * 1024); |
| |
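/*
 * Copy data from an iterator into the pages of a folio, page by page, using
 * the atomic per-page copy (the caller is expected to have faulted in the
 * source pages beforehand).
 */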
| static size_t copy_folio_from_iter_atomic(struct folio *folio, |
| unsigned int offset, size_t size, |
| struct iov_iter *i) |
| { |
| size_t copied = 0, n; |
| |
| do { |
| unsigned int index = offset / PAGE_SIZE; |
| unsigned int poffset = offset % PAGE_SIZE; |
		unsigned int psize = min_t(size_t, PAGE_SIZE - poffset, size);
| |
| n = copy_page_from_iter_atomic(folio_file_page(folio, index), |
| poffset, psize, i); |
| copied += n; |
| if (n < psize) |
| break; |
		size -= n;
		offset += n;
| } while (size); |
| return copied; |
| } |
| |
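/* Source of debug IDs for dirty region records. */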
| atomic_t netfs_region_debug_ids; |
| |
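/*
 * Determine whether two non-empty ranges overlap.  Range end points are
 * treated as exclusive.
 */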
| static bool __overlaps(unsigned long long start1, unsigned long long end1, |
| unsigned long long start2, unsigned long long end2) |
| { |
| return (start1 < start2) ? end1 > start2 : end2 > start1; |
| } |
| |
| static bool overlaps(struct netfs_range *a, struct netfs_range *b) |
| { |
| return __overlaps(a->start, a->end, b->start, b->end); |
| } |
| |
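/*
 * Wait interruptibly for a region to reach at least the specified state.
 */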
| static int wait_on_region(struct netfs_dirty_region *region, |
| enum netfs_region_state state) |
| { |
	return wait_var_event_interruptible(&region->state,
| READ_ONCE(region->state) >= state); |
| } |
| |
| /* |
| * Grab a folio for writing. We don't lock it at this point as we have yet to |
| * preemptively trigger a fault-in - but we need to know how large the folio |
| * will be before we try that. |
| */ |
| static struct folio *netfs_grab_folio_for_write(struct address_space *mapping, |
| loff_t pos, size_t len_remaining) |
| { |
| return __filemap_get_folio(mapping, pos >> PAGE_SHIFT, |
| FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE, |
| mapping_gfp_mask(mapping)); |
| } |
| |
| /* |
| * Initialise a new dirty folio group. The caller is responsible for setting |
| * the type and any flags that they want. |
| */ |
| static void netfs_init_dirty_region(struct netfs_dirty_region *region, |
| struct inode *inode, struct file *file, |
| enum netfs_region_type type, |
| unsigned long flags, |
| struct netfs_range *proposal) |
| { |
| struct netfs_flush_group *group; |
| struct netfs_i_context *ctx = netfs_i_context(inode); |
| unsigned int bsize = min_t(unsigned int, 1U << ctx->min_bshift, PAGE_SIZE); |
| |
| region->state = NETFS_REGION_IS_PENDING; |
| region->type = type; |
| region->flags = flags; |
| region->dirty.start = proposal->start; |
| region->dirty.end = proposal->start; |
| region->bounds.start = round_down(proposal->start, bsize); |
	region->bounds.end = round_up(proposal->end, bsize);
| region->will_modify_to = proposal->end; |
| region->debug_id = atomic_inc_return(&netfs_region_debug_ids); |
	INIT_LIST_HEAD(&region->dirty_link);
	INIT_LIST_HEAD(&region->flush_link);
	refcount_set(&region->ref, 2);
	spin_lock_init(&region->lock);
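	/* A copy to the cache carries data that has already been read, so the
	 * region starts off dirty rather than pending.
	 */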
| if (type == NETFS_REGION_CACHE_COPY) { |
| region->state = NETFS_REGION_IS_DIRTY; |
| region->dirty.end = proposal->end; |
| } |
| |
| if (file && ctx->ops->init_dirty_region) |
| ctx->ops->init_dirty_region(region, file); |
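	/* If the filesystem didn't attach the region to a flush group, add it
	 * to the most recently created one.
	 */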
| if (!region->group) { |
| group = list_last_entry(&ctx->flush_groups, |
| struct netfs_flush_group, group_link); |
| region->group = netfs_get_flush_group(group); |
| spin_lock(&ctx->lock); |
		list_add_tail(&region->flush_link, &group->region_list);
| spin_unlock(&ctx->lock); |
| } |
	trace_netfs_ref_region(region->debug_id, refcount_read(&region->ref),
| netfs_region_trace_new); |
| trace_netfs_dirty(ctx, region, NULL, netfs_dirty_trace_new); |
| netfs_proc_add_region(region); |
| } |
| |
| /* |
| * Queue a region for flushing. Regions may need to be flushed in the right |
| * order (e.g. ceph snaps) and so we may need to chuck other regions onto the |
| * flush queue first. |
| * |
| * The caller must hold ctx->lock. |
| */ |
| void netfs_flush_region(struct netfs_i_context *ctx, |
| struct netfs_dirty_region *region, |
| enum netfs_dirty_trace why) |
| { |
	struct netfs_flush_group *group;
| |
| _enter("%x", region->debug_id); |
| |
	if (test_and_set_bit(NETFS_REGION_FLUSH_Q, &region->flags) ||
	    test_bit(NETFS_FGROUP_FLUSHED, &region->group->flags))
| return; |
| |
| trace_netfs_dirty(ctx, region, NULL, why); |
| |
| /* If the region isn't in the bottom flush group, we need to flush out |
| * all of the flush groups below it. |
| */ |
	if (!test_and_set_bit(NETFS_FGROUP_PARENT_FLUSHED, &region->group->flags)) {
| group = region->group; |
| list_for_each_entry_continue_reverse(group, &ctx->flush_groups, group_link) { |
| set_bit(NETFS_FGROUP_FLUSHED, &group->flags); |
| set_bit(NETFS_FGROUP_PARENT_FLUSHED, &group->flags); |
| } |
| } |
| } |
| |
| /* |
| * Decide if/how a write can be merged with a dirty region. |
| */ |
| static enum netfs_write_compatibility netfs_write_compatibility( |
| struct netfs_i_context *ctx, |
| struct netfs_dirty_region *old, |
| struct netfs_dirty_region *candidate) |
| { |
| /* Regions being actively flushed can't be merged with */ |
| if (old->state >= NETFS_REGION_IS_FLUSHING || |
| candidate->group != old->group || |
| test_bit(NETFS_FGROUP_FLUSHED, &old->group->flags)) { |
| _leave(" = INCOM [flush]"); |
| return NETFS_WRITES_INCOMPATIBLE; |
| } |
| |
	/* DIO writes can't overlap with anything else.  An existing DSYNC
	 * region can't be merged with either, as it is flushed and waited for
	 * immediately; a new DSYNC write may supersede ordinary dirty data,
	 * however (see below).
	 */
| if (candidate->type == NETFS_REGION_DIO || |
| old->type == NETFS_REGION_DIO || |
| old->type == NETFS_REGION_DSYNC) { |
| _leave(" = INCOM [dio/dsy]"); |
| return NETFS_WRITES_INCOMPATIBLE; |
| } |
| |
| /* Pending writes to the cache alone (ie. copy from a read) can be |
| * merged or superseded by a modification that will require writing to |
| * the server too. |
| */ |
| if (old->type == NETFS_REGION_CACHE_COPY) { |
| if (candidate->type == NETFS_REGION_CACHE_COPY) { |
| _leave(" = COMPT [ccopy]"); |
| return NETFS_WRITES_COMPATIBLE; |
| } |
| _leave(" = SUPER [ccopy]"); |
| return NETFS_WRITES_SUPERSEDE; |
| } |
| |
| if (!ctx->ops->is_write_compatible) { |
| if (candidate->type == NETFS_REGION_DSYNC) { |
| _leave(" = SUPER [dsync]"); |
| return NETFS_WRITES_SUPERSEDE; |
| } |
| _leave(" = COMPT"); |
| return NETFS_WRITES_COMPATIBLE; |
| } |
| return ctx->ops->is_write_compatible(ctx, old, candidate); |
| } |
| |
| /* |
| * Split a dirty region. |
| */ |
| static struct netfs_dirty_region *netfs_split_dirty_region( |
| struct netfs_i_context *ctx, |
| struct netfs_dirty_region *region, |
| struct netfs_dirty_region **spare, |
| unsigned long long pos) |
| { |
| struct netfs_dirty_region *tail = *spare; |
| |
| *spare = NULL; |
| *tail = *region; |
| region->dirty.end = pos; |
| tail->dirty.start = pos; |
| tail->debug_id = atomic_inc_return(&netfs_region_debug_ids); |
| |
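	/* The tail is a new record, so it needs its own usage count, its own
	 * ref on the flush group and its own lock.
	 */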
| refcount_set(&tail->ref, 1); |
| netfs_get_flush_group(tail->group); |
| spin_lock_init(&tail->lock); |
| // TODO: grab cache resources |
| |
| // need to split the bounding box? |
| __set_bit(NETFS_REGION_SUPERSEDED, &tail->flags); |
| if (ctx->ops->split_dirty_region) |
| ctx->ops->split_dirty_region(tail); |
	list_add(&tail->dirty_link, &region->dirty_link);
	list_add(&tail->flush_link, &region->flush_link);
| trace_netfs_dirty(ctx, tail, region, netfs_dirty_trace_split); |
| netfs_proc_add_region(tail); |
| return tail; |
| } |
| |
| /* |
| * Make sure there's a flush group. |
| */ |
| static int netfs_require_flush_group(struct inode *inode) |
| { |
| struct netfs_flush_group *group; |
| struct netfs_i_context *ctx = netfs_i_context(inode); |
| |
| if (list_empty(&ctx->flush_groups)) { |
| kdebug("new flush group"); |
| group = netfs_new_flush_group(inode, NULL); |
| if (!group) |
| return -ENOMEM; |
| } |
| return 0; |
| } |
| |
| /* |
| * Create a dirty region record for the write we're about to do and add it to |
| * the list of regions. We may need to wait for conflicting writes to |
| * complete. |
| */ |
| static struct netfs_dirty_region *netfs_prepare_region(struct inode *inode, |
| struct file *file, |
| struct netfs_range *proposal, |
| enum netfs_region_type type, |
| unsigned long flags) |
| { |
| struct netfs_dirty_region *candidate; |
| int ret; |
| |
| ret = netfs_require_flush_group(inode); |
| if (ret < 0) |
| return ERR_PTR(ret); |
| |
| candidate = netfs_alloc_dirty_region(); |
| if (!candidate) |
| return ERR_PTR(-ENOMEM); |
| |
| netfs_init_dirty_region(candidate, inode, file, type, flags, proposal); |
| return candidate; |
| } |
| |
| /* |
| * Activate a write. This adds it to the dirty list and does any necessary |
 * flushing and superseding there. The caller must provide a spare region
| * record so that we can split a dirty record if we need to supersede it. |
| */ |
| static void __netfs_activate_write(struct netfs_i_context *ctx, |
| struct netfs_range *proposal, |
| struct netfs_dirty_region *candidate, |
| struct netfs_dirty_region **spare) |
| { |
| struct netfs_dirty_region *r; |
| struct list_head *p; |
| enum netfs_write_compatibility comp; |
| bool conflicts = false; |
| |
| /* See if there are any dirty regions that need flushing first. */ |
| list_for_each(p, &ctx->dirty_regions) { |
| r = list_entry(p, struct netfs_dirty_region, dirty_link); |
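		/* Skip regions that end at or before the start of our bounding
		 * box and stop at the first one that begins at or beyond its
		 * end.
		 */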
| if (r->bounds.end <= candidate->bounds.start) |
| continue; |
| if (r->bounds.start >= candidate->bounds.end) |
| break; |
| |
| if (list_empty(&candidate->dirty_link) && |
| r->dirty.start > candidate->dirty.start) |
| list_add_tail(&candidate->dirty_link, p); |
| |
| comp = netfs_write_compatibility(ctx, r, candidate); |
| switch (comp) { |
| case NETFS_WRITES_INCOMPATIBLE: |
| netfs_flush_region(ctx, r, netfs_dirty_trace_flush_conflict); |
| conflicts = true; |
| continue; |
| |
| case NETFS_WRITES_SUPERSEDE: |
| if (!overlaps(proposal, &r->dirty)) |
| continue; |
| if (r->dirty.start < candidate->dirty.start) { |
				/* The region overlaps the start of ours, so
				 * we split it, mark the overlapping part as
				 * superseded and insert ourselves between the
				 * two halves.
				 */
| r = netfs_split_dirty_region(ctx, r, spare, |
| proposal->start); |
| list_add_tail(&candidate->dirty_link, &r->dirty_link); |
| p = &r->dirty_link; /* Advance the for-loop */ |
| } else { |
| /* The region is after ours, so make sure we're |
| * inserted before it. |
| */ |
| if (list_empty(&candidate->dirty_link)) |
| list_add_tail(&candidate->dirty_link, &r->dirty_link); |
| set_bit(NETFS_REGION_SUPERSEDED, &r->flags); |
| trace_netfs_dirty(ctx, candidate, r, netfs_dirty_trace_supersedes); |
| } |
| continue; |
| |
| case NETFS_WRITES_COMPATIBLE: |
| continue; |
| } |
| } |
| |
| if (list_empty(&candidate->dirty_link)) |
| list_add_tail(&candidate->dirty_link, p); |
| |
| if (conflicts) { |
| /* The caller must wait for the flushes to complete. */ |
| trace_netfs_dirty(ctx, candidate, NULL, netfs_dirty_trace_wait_active); |
| kleave(" [wait flush D=%x]", candidate->debug_id); |
| return; |
| } |
| |
| /* Okay, we're cleared to proceed. */ |
| smp_store_release(&candidate->state, NETFS_REGION_IS_ACTIVE); |
| trace_netfs_dirty(ctx, candidate, NULL, netfs_dirty_trace_active); |
| wake_up_var(&candidate->state); |
| _leave(" [go]"); |
| return; |
| } |
| |
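/*
 * Allocate a spare region record and then, under the context lock, add the
 * proposed write to the dirty list, flushing or superseding conflicting
 * regions as necessary.
 */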
| static int netfs_activate_write(struct netfs_i_context *ctx, |
| struct netfs_range *proposal, |
| struct netfs_dirty_region *region) |
| { |
| struct netfs_dirty_region *spare; |
| |
| spare = netfs_alloc_dirty_region(); |
| if (!spare) |
| return -ENOMEM; |
| |
| spin_lock(&ctx->lock); |
| __netfs_activate_write(ctx, proposal, region, &spare); |
| spin_unlock(&ctx->lock); |
| netfs_free_dirty_region(ctx, spare); |
| return 0; |
| } |
| |
| /* |
| * Merge a completed active write into the list of dirty regions. The region |
| * can be in one of a number of states: |
| * |
| * - Ordinary write, error, no data copied. Discard. |
 * - Ordinary write, unflushed. Dirty.
 * - Ordinary write, flush started. Dirty.
| * - Ordinary write, completed/failed. Discard. |
| * - DIO write, completed/failed. Discard. |
| * - DSYNC write, error before flush. As ordinary. |
 * - DSYNC write, flush in progress, EINTR. Dirty (supersede).
| * - DSYNC write, written to server and cache. Dirty (supersede)/Discard. |
| * - DSYNC write, written to server but not yet cache. Dirty. |
| * |
| * Once we've dealt with this record, we see about activating some other writes |
| * to fill the activity hole. |
| * |
| * This eats the caller's ref on the region. |
| */ |
| static void netfs_merge_dirty_region(struct netfs_i_context *ctx, |
| struct netfs_dirty_region *region, |
| struct list_head *graveyard) |
| { |
| struct netfs_dirty_region *p, *q, *front; |
| size_t min_bsize = 1UL << ctx->min_bshift; |
| bool new_content = test_bit(NETFS_ICTX_NEW_CONTENT, &ctx->flags); |
| |
| region->dirty.start = round_down(region->dirty.start, min_bsize); |
| region->dirty.end = min_t(unsigned long long, |
| round_up(region->dirty.end, min_bsize), |
| i_size_read(netfs_inode(ctx))); |
| |
| switch (region->type) { |
| case NETFS_REGION_DIO: |
		list_move_tail(&region->dirty_link, graveyard);
| return; |
| |
| case NETFS_REGION_DSYNC: |
| /* A DSYNC write may have overwritten some dirty data |
| * and caused the writeback of other dirty data. |
| */ |
| goto scan_forwards; |
| |
| case NETFS_REGION_ORDINARY: |
| if (region->dirty.end == region->dirty.start) { |
			list_move_tail(&region->dirty_link, graveyard);
| return; |
| } |
| goto scan_backwards; |
| |
| case NETFS_REGION_CACHE_COPY: |
| goto scan_backwards; |
| } |
| |
| scan_backwards: |
| _debug("scan_backwards"); |
| /* Search backwards for a preceding record that we might be able to |
| * merge with. We skip over any intervening flush-in-progress records. |
| */ |
| p = front = region; |
| list_for_each_entry_continue_reverse(p, &ctx->dirty_regions, dirty_link) { |
| _debug("- back %x", p->debug_id); |
| if (p->state >= NETFS_REGION_IS_FLUSHING) |
| continue; |
| if (p->state == NETFS_REGION_IS_ACTIVE) |
| break; |
| if (p->bounds.end < region->bounds.start) |
| break; |
| if (p->dirty.end >= region->dirty.start || new_content) |
| goto merge_backwards; |
| } |
| goto scan_forwards; |
| |
| merge_backwards: |
| _debug("merge_backwards"); |
| if (test_bit(NETFS_REGION_SUPERSEDED, &p->flags) || |
| netfs_write_compatibility(ctx, p, region) != NETFS_WRITES_COMPATIBLE) |
| goto scan_forwards; |
| |
| front = p; |
| front->bounds.end = max(front->bounds.end, region->bounds.end); |
| front->dirty.end = max(front->dirty.end, region->dirty.end); |
	set_bit(NETFS_REGION_SUPERSEDED, &region->flags);
	list_del_init(&region->flush_link);
| trace_netfs_dirty(ctx, front, region, netfs_dirty_trace_merged_back); |
| |
| scan_forwards: |
| /* Subsume forwards any records this one covers. There should be no |
| * non-supersedeable incompatible regions in our range as we would have |
| * flushed and waited for them before permitting this write to start. |
| * |
| * There can, however, be regions undergoing flushing which we need to |
| * skip over and not merge with. |
| */ |
| _debug("scan_forwards"); |
| p = region; |
| list_for_each_entry_safe_continue(p, q, &ctx->dirty_regions, dirty_link) { |
| _debug("- forw %x", p->debug_id); |
| if (p->state >= NETFS_REGION_IS_FLUSHING) |
| continue; |
| if (p->state == NETFS_REGION_IS_ACTIVE) |
| break; |
		if (p->dirty.start > region->dirty.end &&
		    (!new_content || p->bounds.start > region->bounds.end))
			break;
| |
| if (region->dirty.end >= p->dirty.end) { |
| /* Entirely subsumed */ |
| list_move_tail(&p->dirty_link, graveyard); |
| list_del_init(&p->flush_link); |
| trace_netfs_dirty(ctx, front, p, netfs_dirty_trace_merged_sub); |
| continue; |
| } |
| |
| goto merge_forwards; |
| } |
| goto merge_complete; |
| |
| merge_forwards: |
| _debug("merge_forwards"); |
| if (test_bit(NETFS_REGION_SUPERSEDED, &p->flags) || |
| netfs_write_compatibility(ctx, p, front) == NETFS_WRITES_SUPERSEDE) { |
| /* If a region was partially superseded by us, we need to roll |
| * it forwards and remove the superseded flag. |
| */ |
| if (p->dirty.start < front->dirty.end) { |
| p->dirty.start = front->dirty.end; |
| clear_bit(NETFS_REGION_SUPERSEDED, &p->flags); |
| } |
| trace_netfs_dirty(ctx, p, front, netfs_dirty_trace_superseded); |
| goto merge_complete; |
| } |
| |
| /* Simply merge overlapping/contiguous ordinary areas together. */ |
| front->bounds.end = max(front->bounds.end, p->bounds.end); |
| front->dirty.end = max(front->dirty.end, p->dirty.end); |
| list_move_tail(&p->dirty_link, graveyard); |
| list_del_init(&p->flush_link); |
| trace_netfs_dirty(ctx, front, p, netfs_dirty_trace_merged_forw); |
| |
| merge_complete: |
	if (test_bit(NETFS_REGION_SUPERSEDED, &region->flags))
		list_move_tail(&region->dirty_link, graveyard);
| } |
| |
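/*
 * Dispose of the regions that were subsumed or discarded during a merge,
 * marking each one complete and waking up anyone waiting on it.
 */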
| static void netfs_clear_up_merged_regions(struct netfs_i_context *ctx, |
| struct list_head *graveyard) |
| { |
| struct netfs_dirty_region *p; |
| |
| while (!list_empty(graveyard)) { |
| p = list_first_entry(graveyard, struct netfs_dirty_region, dirty_link); |
| list_del_init(&p->dirty_link); |
| smp_store_release(&p->state, NETFS_REGION_IS_COMPLETE); |
| trace_netfs_dirty(ctx, p, NULL, netfs_dirty_trace_complete); |
| wake_up_var(&p->state); |
| netfs_put_dirty_region(ctx, p, netfs_region_trace_put_merged); |
| } |
| } |
| |
| /* |
| * We completed the modification phase of a write. Fix up the dirty list. |
| */ |
| static void netfs_commit_write(struct netfs_i_context *ctx, |
| struct netfs_dirty_region *region) |
| { |
| LIST_HEAD(graveyard); |
| |
| spin_lock(&ctx->lock); |
	smp_store_release(&region->state, NETFS_REGION_IS_DIRTY);
| trace_netfs_dirty(ctx, region, NULL, netfs_dirty_trace_commit); |
	wake_up_var(&region->state);
| |
| netfs_merge_dirty_region(ctx, region, &graveyard); |
| spin_unlock(&ctx->lock); |
| netfs_clear_up_merged_regions(ctx, &graveyard); |
| } |
| |
| enum netfs_handle_nonuptodate { |
| NETFS_FOLIO_IS_UPTODATE, /* Folio is uptodate already */ |
| NETFS_JUST_PREFETCH, /* We have to read the folio anyway */ |
| NETFS_WHOLE_FOLIO_MODIFY, /* We're going to overwrite the whole folio */ |
| NETFS_MODIFY_AND_CLEAR, /* We can assume there is no data to be downloaded. */ |
| }; |
| |
| /* |
| * Decide how we should handle a non-uptodate folio that we want to modify. We |
| * might be attempting to do write-streaming, in which case we don't want to a |
| * local RMW cycle if we can avoid it. If we're doing local caching or content |
| * crypto, we award that priority over avoiding RMW. If the file is open |
| * readably, then we also assume that we may want to written what we wrote. |
| */ |
| static enum netfs_handle_nonuptodate netfs_handle_nonuptodate_folio(struct netfs_i_context *ctx, |
| struct file *file, |
| struct folio *folio, |
| size_t offset, |
| size_t len, |
| bool always_fill) |
| { |
| size_t min_bsize = 1UL << ctx->min_bshift; |
| loff_t pos = folio_file_pos(folio); |
| |
| _enter("f=%lx,z=%llx", ctx->flags, ctx->zero_point); |
| |
| if (folio_test_uptodate(folio)) |
| return NETFS_FOLIO_IS_UPTODATE; |
| |
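	/* If the inode is marked as having no prior content or the write is
	 * at or beyond the zero point, there's nothing to download and we can
	 * just clear the parts of the folio around the modification.
	 */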
| if (test_bit(NETFS_ICTX_NEW_CONTENT, &ctx->flags) || |
| pos >= ctx->zero_point) |
| return NETFS_MODIFY_AND_CLEAR; |
| |
| if (always_fill) |
| return NETFS_JUST_PREFETCH; |
| |
| if (offset == 0 && |
| len >= folio_size(folio) && |
| len >= min_bsize) |
| return NETFS_WHOLE_FOLIO_MODIFY; |
| |
| if (file->f_mode & FMODE_READ || |
| test_bit(NETFS_ICTX_DO_RMW, &ctx->flags)) |
| return NETFS_JUST_PREFETCH; |
| |
	if (netfs_i_cookie(file_inode(file)) ||
	    ctx->min_bshift > 0)
| return NETFS_JUST_PREFETCH; |
| |
| /* TODO: Handle streaming writes where we avoid doing client-side RMW |
| * by not bringing pages fully uptodate. |
| * |
| * TODO: Consider doing a streaming write if we're about to completely |
| * overwrite a number of blocks. Could also do a streaming write if |
| * we're willing to do one or more reads to fill up the edges of a |
| * partially modified block prior to writing it back. |
| */ |
| return NETFS_JUST_PREFETCH; |
| } |
| |
| /* |
| * Write data into a prereserved region of the pagecache attached to a netfs |
| * inode. |
| */ |
| static ssize_t netfs_perform_write(struct netfs_dirty_region *region, |
| struct kiocb *iocb, struct iov_iter *i) |
| { |
| struct file *file = iocb->ki_filp; |
| struct netfs_i_context *ctx = netfs_i_context(file_inode(file)); |
| struct folio *folio; |
| enum netfs_handle_nonuptodate nupt; |
| ssize_t written = 0, ret; |
| loff_t new_pos, i_size; |
| bool always_fill = false; |
| |
| ret = ctx->ops->validate_for_write(file_inode(file), region); |
| if (ret < 0) |
| return ret; |
| |
| do { |
| size_t plen; |
| size_t offset; /* Offset into pagecache folio */ |
| size_t bytes; /* Bytes to write to folio */ |
| size_t copied; /* Bytes copied from user */ |
| bool relock = false; |
| |
| folio = netfs_grab_folio_for_write(file->f_mapping, |
| region->dirty.end, |
| iov_iter_count(i)); |
| if (!folio) |
| return -ENOMEM; |
| |
| plen = folio_size(folio); |
| offset = region->dirty.end - folio_file_pos(folio); |
| bytes = min_t(size_t, plen - offset, iov_iter_count(i)); |
| |
| if (!folio_test_uptodate(folio)) { |
| folio_unlock(folio); /* Avoid deadlocking fault-in */ |
| relock = true; |
| } |
| |
| /* Bring in the user page that we will copy from _first_. |
| * Otherwise there's a nasty deadlock on copying from the |
| * same page as we're writing to, without it being marked |
| * up-to-date. |
| * |
| * Not only is this an optimisation, but it is also required |
| * to check that the address is actually valid, when atomic |
| * usercopies are used, below. |
| */ |
| if (unlikely(fault_in_iov_iter_readable(i, bytes))) { |
| ret = -EFAULT; |
| goto error_folio; |
| } |
| |
| if (fatal_signal_pending(current)) { |
| ret = -EINTR; |
| goto error_folio; |
| } |
| |
| if (relock) { |
| ret = folio_lock_killable(folio); |
| if (ret < 0) |
| goto error_folio; |
| } |
| |
| redo_prefetch: |
		/* See if we need to read data into the folio before modifying
		 * it - for instance to preserve the parts we're not going to
		 * overwrite, or to satisfy whole-block requirements imposed
		 * by caching or content crypto.
		 */
| nupt = netfs_handle_nonuptodate_folio(ctx, file, folio, |
| offset, bytes, always_fill); |
| _debug("nupt %u", nupt); |
| switch (nupt) { |
| case NETFS_JUST_PREFETCH: |
| ret = netfs_prefetch_for_write(file, folio, region, bytes); |
| if (ret < 0) { |
| kdebug("prefetch = %zd", ret); |
| goto error_folio; |
| } |
| nupt = NETFS_FOLIO_IS_UPTODATE; |
| fallthrough; |
| case NETFS_FOLIO_IS_UPTODATE: |
| break; |
| case NETFS_MODIFY_AND_CLEAR: |
| zero_user_segment(&folio->page, 0, offset); |
| break; |
| case NETFS_WHOLE_FOLIO_MODIFY: |
| break; |
| } |
| |
| if (mapping_writably_mapped(folio_file_mapping(folio))) |
| flush_dcache_folio(folio); |
| copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); |
| flush_dcache_folio(folio); |
| |
| /* Deal with a (partially) failed copy */ |
| if (!folio_test_uptodate(folio)) { |
| if (copied == 0) { |
| ret = -EFAULT; |
| goto error_folio; |
| } |
| if (copied < bytes) { |
| iov_iter_revert(i, copied); |
| always_fill = true; |
| goto redo_prefetch; |
| } |
| switch (nupt) { |
| case NETFS_JUST_PREFETCH: |
| case NETFS_FOLIO_IS_UPTODATE: |
| /* We have the folio locked, so it really ought |
| * to be uptodate. |
| */ |
| WARN(true, "Locked folio %lx became non-uptodate\n", |
| folio_index(folio)); |
| ret = -EIO; |
| goto error_folio; |
| case NETFS_MODIFY_AND_CLEAR: |
| zero_user_segment(&folio->page, offset + copied, plen); |
| fallthrough; |
| case NETFS_WHOLE_FOLIO_MODIFY: |
| folio_mark_uptodate(folio); |
| break; |
| } |
| } |
| |
| /* Update the inode size if we moved the EOF marker */ |
| new_pos = region->dirty.end + copied; |
| i_size = i_size_read(file_inode(file)); |
| if (new_pos > i_size) { |
| if (ctx->ops->update_i_size) { |
| ctx->ops->update_i_size(file, new_pos); |
| } else { |
| i_size_write(file_inode(file), new_pos); |
| fscache_update_cookie(ctx->cache, NULL, &new_pos); |
| } |
| } |
| |
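		/* Advance the end of the dirty range.  The release barrier
		 * orders the data copy before the new endpoint so that a
		 * flusher that sees the extension also sees the data.
		 */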
		smp_store_release(&region->dirty.end, new_pos);
| |
| trace_netfs_dirty(ctx, region, NULL, netfs_dirty_trace_modified); |
| folio_mark_dirty(folio); |
| folio_unlock(folio); |
| folio_put(folio); |
| folio = NULL; |
| |
| cond_resched(); |
| |
| written += copied; |
| |
| balance_dirty_pages_ratelimited(file->f_mapping); |
| } while (iov_iter_count(i)); |
| |
| out: |
| if (likely(written)) { |
| iocb->ki_pos += written; |
| |
| /* Flush and wait for a write that requires immediate synchronisation. */ |
| if (region->type == NETFS_REGION_DSYNC) { |
| kdebug("dsync"); |
| spin_lock(&ctx->lock); |
| netfs_flush_region(ctx, region, netfs_dirty_trace_flush_dsync); |
| spin_unlock(&ctx->lock); |
| |
| ret = wait_on_region(region, NETFS_REGION_IS_COMPLETE); |
| if (ret < 0) |
| written = ret; |
| } |
| } |
| |
| return written ? written : ret; |
| |
| error_folio: |
| folio_unlock(folio); |
| folio_put(folio); |
| goto out; |
| } |
| |
| /** |
| * netfs_file_write_iter - write data to a file |
| * @iocb: IO state structure |
| * @from: iov_iter with data to write |
| * |
 * Perform a buffered write to a file through the netfs library. A dirty
 * region record is created to cover the write; it excludes conflicting
 * writes and is used to synchronise against flushes. The inode lock is held
 * across the write and, for an O_DSYNC write, the region is flushed and
 * waited for before returning.
 * Return:
 * * negative error code if no data has been written at all or if the
 *   synchronous flush failed
| * * number of bytes written, even for truncated writes |
| */ |
| ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) |
| { |
| struct netfs_dirty_region *region = NULL; |
| struct file *file = iocb->ki_filp; |
| struct inode *inode = file->f_mapping->host; |
| struct netfs_i_context *ctx = netfs_i_context(inode); |
| struct netfs_range proposal; |
| enum netfs_region_type type; |
| unsigned long flags = 0; |
| ssize_t ret; |
| |
| _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode)); |
| |
| proposal.start = iocb->ki_pos; |
| proposal.end = iocb->ki_pos + iov_iter_count(from); |
| |
| inode_lock(inode); |
| ret = generic_write_checks(iocb, from); |
| if (ret <= 0) |
| goto error_no_region; |
| |
	if (iocb->ki_flags & IOCB_DIRECT)
		type = NETFS_REGION_DIO;
	else if (iocb->ki_flags & IOCB_DSYNC)
		type = NETFS_REGION_DSYNC;
| else |
| type = NETFS_REGION_ORDINARY; |
| if (iocb->ki_flags & IOCB_SYNC) |
| __set_bit(NETFS_REGION_SYNC, &flags); |
| |
| region = netfs_prepare_region(inode, file, &proposal, type, flags); |
| if (IS_ERR(region)) { |
| ret = PTR_ERR(region); |
| goto error_no_region; |
| } |
| |
| trace_netfs_write_iter(region, iocb, from); |
| |
| /* We can write back this queue in page reclaim */ |
| current->backing_dev_info = inode_to_bdi(inode); |
| ret = file_remove_privs(file); |
| if (ret) |
| goto error; |
| |
| ret = file_update_time(file); |
| if (ret) |
| goto error; |
| |
| ret = netfs_activate_write(ctx, &proposal, region); |
| if (ret < 0) |
| goto error; |
| |
| ret = wait_on_region(region, NETFS_REGION_IS_ACTIVE); |
| if (ret < 0) |
| goto error; |
| |
| /* The region excludes overlapping writes and is used to synchronise |
| * versus flushes. |
| */ |
| if (iocb->ki_flags & IOCB_DIRECT) |
| ret = -EOPNOTSUPP; //netfs_file_direct_write(region, iocb, from); |
| else |
| ret = netfs_perform_write(region, iocb, from); |
| |
| error: |
| netfs_commit_write(ctx, region); |
| inode_unlock(inode); |
| /* TODO: Wait for DSYNC region here. */ |
| netfs_put_dirty_region(ctx, region, netfs_region_trace_put_write_iter); |
| current->backing_dev_info = NULL; |
| return ret; |
| error_no_region: |
| inode_unlock(inode); |
| return ret; |
| } |
| EXPORT_SYMBOL(netfs_file_write_iter); |
| |
| /* |
 * Add a region covering data that's just been read to the dirty list to
 * schedule a write to the cache.
| */ |
| static bool netfs_copy_to_cache(struct netfs_read_request *rreq, |
| struct netfs_read_subrequest *subreq) |
| { |
| struct netfs_dirty_region *candidate, *r; |
| struct netfs_i_context *ctx = netfs_i_context(rreq->inode); |
| struct netfs_range proposal; |
| struct list_head *p; |
| int ret; |
| |
| LIST_HEAD(graveyard); |
| |
| ret = netfs_require_flush_group(rreq->inode); |
| if (ret < 0) |
| return false; |
| |
| candidate = netfs_alloc_dirty_region(); |
| if (!candidate) |
| return false; |
| |
| proposal.start = subreq->start; |
| proposal.end = subreq->start + subreq->len; |
| netfs_init_dirty_region(candidate, rreq->inode, NULL, |
| NETFS_REGION_CACHE_COPY, 0, &proposal); |
| |
| spin_lock(&ctx->lock); |
| |
| /* Find a place to insert. There can't be any dirty regions |
| * overlapping with the region we're adding. |
| */ |
| list_for_each(p, &ctx->dirty_regions) { |
| r = list_entry(p, struct netfs_dirty_region, dirty_link); |
| if (r->bounds.end <= candidate->bounds.start) |
| continue; |
| if (r->bounds.start >= candidate->bounds.end) |
| break; |
| } |
| |
| list_add_tail(&candidate->dirty_link, p); |
| netfs_merge_dirty_region(ctx, candidate, &graveyard); |
| |
| spin_unlock(&ctx->lock); |
| netfs_clear_up_merged_regions(ctx, &graveyard); |
| netfs_put_dirty_region(ctx, candidate, netfs_region_trace_put_copy); |
| return true; |
| } |
| |
| /* |
| * If we downloaded some data and it now needs writing to the cache, we add it |
| * to the dirty region list and let that flush it. This way it can get merged |
| * with writes. |
| * |
| * We inherit a ref from the caller. |
| */ |
| void netfs_rreq_do_write_to_cache(struct netfs_read_request *rreq) |
| { |
| struct netfs_read_subrequest *subreq, *next, *p; |
| |
| trace_netfs_rreq(rreq, netfs_rreq_trace_write); |
| |
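	/* Discard the subrequests that don't need to be written to the cache. */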
| list_for_each_entry_safe(subreq, p, &rreq->subrequests, rreq_link) { |
| if (!test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags)) { |
| list_del_init(&subreq->rreq_link); |
| netfs_put_subrequest(subreq, false); |
| } |
| } |
| |
| list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { |
| /* Amalgamate adjacent writes */ |
| while (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { |
| next = list_next_entry(subreq, rreq_link); |
| if (next->start != subreq->start + subreq->len) |
| break; |
| subreq->len += next->len; |
| list_del_init(&next->rreq_link); |
| netfs_put_subrequest(next, false); |
| } |
| |
| netfs_copy_to_cache(rreq, subreq); |
| } |
| |
| netfs_rreq_completed(rreq, false); |
| } |