blob: fb52aa0600930102e8f519396b2c7298229c5889 [file] [log] [blame]
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2008 Oracle. All rights reserved.
*/
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
#include "misc.h"
#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"
#include "inode-item.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "dir-item.h"
#include "file-item.h"
#include "file.h"
#include "orphan.h"
#include "tree-checker.h"
/*
 * Limit related to conflicting inodes (going by the name).
 * NOTE(review): the code that uses this constant is not in the visible part
 * of the file - confirm the exact semantics against its users.
 */
#define MAX_CONFLICT_INODES 10
/*
 * Values for the inode_only argument of btrfs_log_inode():
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
enum {
LOG_INODE_ALL,
LOG_INODE_EXISTS,
};
/*
* directory trouble cases
*
* 1) on rename or unlink, if the inode being unlinked isn't in the fsync
* log, we must force a full commit before doing an fsync of the directory
* where the unlink was done.
* ---> record transid of last unlink/rename per directory
*
* mkdir foo/some_dir
* normal commit
* rename foo/some_dir foo2/some_dir
* mkdir foo/some_dir
* fsync foo/some_dir/some_file
*
* The fsync above will unlink the original some_dir without recording
* it in its new location (foo2). After a crash, some_dir will be gone
* unless the fsync of some_file forces a full commit
*
* 2) we must log any new names for any file or dir that is in the fsync
* log. ---> check inode while renaming/linking.
*
* 2a) we must log any new names for any file or dir during rename
* when the directory they are being removed from was logged.
* ---> check inode and old parent dir during rename
*
* 2a is actually the more important variant. Without the extra logging
* a crash might unlink the old name without recreating the new one
*
* 3) after a crash, we must go through any directories with a link count
* of zero and redo the rm -rf
*
* mkdir f1/foo
* normal commit
* rm -rf f1/foo
* fsync(f1)
*
* The directory f1/foo was fully removed from the FS, but fsync was never
* called on f1/foo, only on its parent dir f1. After a crash the rm -rf must
* be replayed. This must be able to recurse down the entire
* directory tree. The inode link count fixup code takes care of the
* ugly details.
*/
/*
 * Stages for the tree walking. The first stage (LOG_WALK_PIN_ONLY, 0)
 * is to only pin down the blocks we find, the second stage
 * (LOG_WALK_REPLAY_INODES, 1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics.
 *
 * NOTE(review): LOG_WALK_REPLAY_DIR_INDEX sits between the inode stage and
 * the last stage; going by its name it replays directory index items, but
 * its users are not in the visible part of the file - confirm.
 */
enum {
LOG_WALK_PIN_ONLY,
LOG_WALK_REPLAY_INODES,
LOG_WALK_REPLAY_DIR_INDEX,
LOG_WALK_REPLAY_ALL,
};
/* Forward declarations of static functions from this file. */
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
int inode_only,
struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_root *log,
struct btrfs_path *path,
u64 dirid, int del_all);
static void wait_log_commit(struct btrfs_root *root, int transid);
/*
* tree logging is a special write ahead log used to make sure that
* fsyncs and O_SYNCs can happen without doing full tree commits.
*
* Full tree commits are expensive because they require commonly
* modified blocks to be recowed, creating many dirty pages in the
* extent tree and a 4x-6x higher write load than ext3.
*
* Instead of doing a tree commit on every fsync, we use the
* key ranges and transaction ids to find items for a given file or directory
* that have changed in this transaction. Those items are copied into
* a special tree (one per subvolume root), that tree is written to disk
* and then the fsync is considered complete.
*
* After a crash, items are copied out of the log-tree back into the
* subvolume tree. Any file data extents found are recorded in the extent
* allocation tree, and the log-tree freed.
*
* The log tree is read three times: once to pin down all the extents it is
* using in ram, once to create all the inodes logged in the tree,
* and once to do all the other items.
*/
/*
 * Start a sub transaction and set up the log tree for @root.
 *
 * This increments the log tree writer count (root->log_writers) to make the
 * people syncing the tree wait for us to finish.
 *
 * Unless ctx->logging_new_name is set, @ctx is also added to the
 * root->log_ctxs[] list selected by root->log_transid % 2 and
 * ctx->log_transid is set to root->log_transid.
 *
 * Returns 0 on success, BTRFS_LOG_FORCE_COMMIT when logging is not possible
 * and a full commit is needed instead, or the error returned by
 * btrfs_init_log_root_tree() / btrfs_add_log_tree().
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_log_ctx *ctx)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *tree_root = fs_info->tree_root;
const bool zoned = btrfs_is_zoned(fs_info);
int ret = 0;
/* Set when this call is the one that created fs_info->log_root_tree. */
bool created = false;
/*
* First check if the log root tree was already created. If not, create
* it before locking the root's log_mutex, just to keep lockdep happy.
*/
if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
mutex_lock(&tree_root->log_mutex);
/* Re-check under the mutex, someone else may have created it. */
if (!fs_info->log_root_tree) {
ret = btrfs_init_log_root_tree(trans, fs_info);
if (!ret) {
set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
created = true;
}
}
mutex_unlock(&tree_root->log_mutex);
if (ret)
return ret;
}
mutex_lock(&root->log_mutex);
again:
if (root->log_root) {
/* The log_commit[] slot that is not the one of the current log_transid. */
int index = (root->log_transid + 1) % 2;
if (btrfs_need_log_full_commit(trans)) {
ret = BTRFS_LOG_FORCE_COMMIT;
goto out;
}
/*
 * On zoned filesystems, if that slot is flagged, wait for the log
 * commit of the previous log transaction and then re-check everything.
 */
if (zoned && atomic_read(&root->log_commit[index])) {
wait_log_commit(root, root->log_transid - 1);
goto again;
}
/*
 * Record the pid of the first task that starts logging; if a task
 * with a different pid joins, flag the root as having multiple log
 * tasks.
 */
if (!root->log_start_pid) {
clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
root->log_start_pid = current->pid;
} else if (root->log_start_pid != current->pid) {
set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
}
} else {
/*
* This means fs_info->log_root_tree was already created
* for some other FS trees. Do the full commit not to mix
* nodes from multiple log transactions to do sequential
* writing.
*/
if (zoned && !created) {
ret = BTRFS_LOG_FORCE_COMMIT;
goto out;
}
/* No log tree for this root yet, create it. */
ret = btrfs_add_log_tree(trans, root);
if (ret)
goto out;
set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
root->log_start_pid = current->pid;
}
/* We are now a writer of the log tree, see btrfs_end_log_trans(). */
atomic_inc(&root->log_writers);
if (!ctx->logging_new_name) {
int index = root->log_transid % 2;
list_add_tail(&ctx->list, &root->log_ctxs[index]);
ctx->log_transid = root->log_transid;
}
out:
mutex_unlock(&root->log_mutex);
return ret;
}
/*
 * Join the log transaction of @root if there is one.
 *
 * Returns 0 if a log tree exists for the root, in which case the log writer
 * count has been incremented, or -ENOENT if there is no log transaction in
 * progress.
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	const bool zoned = btrfs_is_zoned(root->fs_info);
	int ret = -ENOENT;

	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
		return ret;

	mutex_lock(&root->log_mutex);
	while (root->log_root) {
		/* The log_commit[] slot not used by the current log_transid. */
		const int other = (root->log_transid + 1) % 2;

		ret = 0;
		if (!zoned || !atomic_read(&root->log_commit[other])) {
			atomic_inc(&root->log_writers);
			break;
		}
		/*
		 * Zoned and the other slot is flagged: wait for the previous
		 * log transaction's commit, then look at the root again.
		 */
		wait_log_commit(root, root->log_transid - 1);
	}
	mutex_unlock(&root->log_mutex);

	return ret;
}
/*
* This either makes the current running log transaction wait
* until you call btrfs_end_log_trans() or it makes any future
* log transactions wait until you call btrfs_end_log_trans()
*/
void btrfs_pin_log_trans(struct btrfs_root *root)
{
atomic_inc(&root->log_writers);
}
/*
 * Signal that we are done changing the log tree: drop our writer reference
 * and, if it was the last one, wake up anyone waiting to do a sync.
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	/* Other writers remain, nothing to wake up yet. */
	if (!atomic_dec_and_test(&root->log_writers))
		return;

	/* atomic_dec_and_test implies a barrier */
	cond_wake_up_nomb(&root->log_writer_wait);
}
static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
filemap_fdatawait_range(buf->pages[0]->mapping,
buf->start, buf->start + buf->len - 1);
}
/*
 * The walk control struct is used to pass state down the chain when
 * processing the log tree. The stage field tells us which part
 * of the log tree processing we are currently doing. The others
 * are state fields used for that specific part.
 */
struct walk_control {
/*
 * Should we free the extent on disk when done? This is used
 * at transaction commit time while freeing a log tree.
 */
int free;
/*
 * Pin only walk, we record which extents on disk belong to the
 * log trees.
 */
int pin;
/* What stage of the replay code we're currently in. */
int stage;
/*
* Ignore any items from the inode currently being processed. Needs
* to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
* the LOG_WALK_REPLAY_INODES stage.
*/
bool ignore_cur_inode;
/* The root we are currently replaying. */
struct btrfs_root *replay_dest;
/* The trans handle for the current replay. */
struct btrfs_trans_handle *trans;
/*
 * The function that gets used to process blocks we find in the
 * tree. Note the extent_buffer might not be up to date when it is
 * passed in, and it must be checked or read if you need the data
 * inside it. process_one_buffer() uses gen and level as the expected
 * transid and level when it reads the block.
 */
int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
struct walk_control *wc, u64 gen, int level);
};
/*
 * process_func used to pin down the extents that belong to a log tree.
 *
 * On mixed block group filesystems the block is read first, checked against
 * @level and @gen. When wc->pin is set the extent of @eb is pinned for log
 * replay and, if the buffer is an up to date leaf, the extents logged in it
 * are excluded as well.
 *
 * Returns 0 on success or the error of the first step that failed.
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen, int level)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int err;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		struct btrfs_tree_parent_check check = {
			.level = level,
			.transid = gen
		};

		err = btrfs_read_extent_buffer(eb, &check);
		if (err)
			return err;
	}

	if (!wc->pin)
		return 0;

	err = btrfs_pin_extent_for_log_replay(wc->trans, eb->start, eb->len);
	if (err)
		return err;

	/* Only up to date leaves have logged extents we can look at. */
	if (!btrfs_buffer_uptodate(eb, gen, 0) || btrfs_header_level(eb) != 0)
		return 0;

	return btrfs_exclude_logged_extents(eb);
}
/*
 * Item overwrite used by replay and tree logging. eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten. If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 *
 * Note that for inode items the source item in eb is modified in place
 * (nbytes, and i_size for directories) before it is copied.
 *
 * Returns 0 on success, including the case where the destination already
 * holds identical contents and nothing is written. Otherwise returns -ENOMEM
 * or the error from btrfs_search_slot() / btrfs_insert_empty_item().
 */
static int overwrite_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct extent_buffer *eb, int slot,
struct btrfs_key *key)
{
int ret;
u32 item_size;
u64 saved_i_size = 0;
int save_old_i_size = 0;
unsigned long src_ptr;
unsigned long dst_ptr;
bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
/*
* This is only used during log replay, so the root is always from a
* fs/subvolume tree. In case we ever need to support a log root, then
* we'll have to clone the leaf in the path, release the path and use
* the leaf before writing into the log tree. See the comments at
* copy_items() for more details.
*/
ASSERT(root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
item_size = btrfs_item_size(eb, slot);
src_ptr = btrfs_item_ptr_offset(eb, slot);
/* Look for the key in the destination tree. */
ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
if (ret < 0)
return ret;
if (ret == 0) {
char *src_copy;
char *dst_copy;
u32 dst_size = btrfs_item_size(path->nodes[0],
path->slots[0]);
/* Different sizes can't be identical, go straight to the overwrite. */
if (dst_size != item_size)
goto insert;
/* Both items are empty, nothing to compare or copy. */
if (item_size == 0) {
btrfs_release_path(path);
return 0;
}
dst_copy = kmalloc(item_size, GFP_NOFS);
src_copy = kmalloc(item_size, GFP_NOFS);
if (!dst_copy || !src_copy) {
btrfs_release_path(path);
kfree(dst_copy);
kfree(src_copy);
return -ENOMEM;
}
read_extent_buffer(eb, src_copy, src_ptr, item_size);
dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
item_size);
ret = memcmp(dst_copy, src_copy, item_size);
kfree(dst_copy);
kfree(src_copy);
/*
* they have the same contents, just return, this saves
* us from cowing blocks in the destination tree and doing
* extra writes that may not have been done by a previous
* sync
*/
if (ret == 0) {
btrfs_release_path(path);
return 0;
}
/*
* We need to load the old nbytes into the inode so when we
* replay the extents we've logged we get the right nbytes.
*/
if (inode_item) {
struct btrfs_inode_item *item;
u64 nbytes;
u32 mode;
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
nbytes = btrfs_inode_nbytes(path->nodes[0], item);
item = btrfs_item_ptr(eb, slot,
struct btrfs_inode_item);
btrfs_set_inode_nbytes(eb, item, nbytes);
/*
* If this is a directory we need to reset the i_size to
* 0 so that we can set it up properly when replaying
* the rest of the items in this log.
*/
mode = btrfs_inode_mode(eb, item);
if (S_ISDIR(mode))
btrfs_set_inode_size(eb, item, 0);
}
} else if (inode_item) {
struct btrfs_inode_item *item;
u32 mode;
/*
* New inode, set nbytes to 0 so that the nbytes comes out
* properly when we replay the extents.
*/
item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
btrfs_set_inode_nbytes(eb, item, 0);
/*
* If this is a directory we need to reset the i_size to 0 so
* that we can set it up properly when replaying the rest of
* the items in this log.
*/
mode = btrfs_inode_mode(eb, item);
if (S_ISDIR(mode))
btrfs_set_inode_size(eb, item, 0);
}
insert:
btrfs_release_path(path);
/*
 * Try to insert the key into the destination tree. The code below uses
 * the path after an -EEXIST/-EOVERFLOW failure, so skip_release_on_error
 * is presumably what keeps it pointing at the existing item - confirm.
 */
path->skip_release_on_error = 1;
ret = btrfs_insert_empty_item(trans, root, path,
key, item_size);
path->skip_release_on_error = 0;
/* make sure any existing item is the correct size */
if (ret == -EEXIST || ret == -EOVERFLOW) {
u32 found_size;
found_size = btrfs_item_size(path->nodes[0],
path->slots[0]);
if (found_size > item_size)
btrfs_truncate_item(path, item_size, 1);
else if (found_size < item_size)
btrfs_extend_item(path, item_size - found_size);
} else if (ret) {
return ret;
}
dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
path->slots[0]);
/* don't overwrite an existing inode if the generation number
* was logged as zero. This is done when the tree logging code
* is just logging an inode to make sure it exists after recovery.
*
* Also, don't overwrite i_size on directories during replay.
* log replay inserts and removes directory items based on the
* state of the tree found in the subvolume, and i_size is modified
* as it goes
*/
if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
struct btrfs_inode_item *src_item;
struct btrfs_inode_item *dst_item;
src_item = (struct btrfs_inode_item *)src_ptr;
dst_item = (struct btrfs_inode_item *)dst_ptr;
if (btrfs_inode_generation(eb, src_item) == 0) {
struct extent_buffer *dst_eb = path->nodes[0];
const u64 ino_size = btrfs_inode_size(eb, src_item);
/*
* For regular files an ino_size == 0 is used only when
* logging that an inode exists, as part of a directory
* fsync, and the inode wasn't fsynced before. In this
* case don't set the size of the inode in the fs/subvol
* tree, otherwise we would be throwing valid data away.
*/
if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
ino_size != 0)
btrfs_set_inode_size(dst_eb, dst_item, ino_size);
goto no_copy;
}
/* Directory over directory: keep the i_size of the destination item. */
if (S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
save_old_i_size = 1;
saved_i_size = btrfs_inode_size(path->nodes[0],
dst_item);
}
}
copy_extent_buffer(path->nodes[0], eb, dst_ptr,
src_ptr, item_size);
/* Put back the i_size that the copy above has just overwritten. */
if (save_old_i_size) {
struct btrfs_inode_item *dst_item;
dst_item = (struct btrfs_inode_item *)dst_ptr;
btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
}
/* make sure the generation is filled in */
if (key->type == BTRFS_INODE_ITEM_KEY) {
struct btrfs_inode_item *dst_item;
dst_item = (struct btrfs_inode_item *)dst_ptr;
if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
btrfs_set_inode_generation(path->nodes[0], dst_item,
trans->transid);
}
}
no_copy:
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_release_path(path);
return 0;
}
static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
struct fscrypt_str *name)
{
char *buf;
buf = kmalloc(len, GFP_NOFS);
if (!buf)
return -ENOMEM;
read_extent_buffer(eb, buf, (unsigned long)start, len);
name->name = buf;
name->len = len;
return 0;
}
/*
 * Simple helper to look up the inode @objectid of @root with btrfs_iget().
 * This can only be called for subvolume roots and not for the log.
 *
 * Any lookup error is collapsed into a NULL return.
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct inode *found = btrfs_iget(root->fs_info->sb, objectid, root);

	return IS_ERR(found) ? NULL : found;
}
/* replays a single extent in 'eb' at 'slot' with 'key' into the
* subvolume 'root'. path is released on entry and should be released
* on exit.
*
* extents in the log tree have not been allocated out of the extent
* tree yet. So, this completes the allocation, taking a reference
* as required if the extent already exists or creating a new extent
* if it isn't in the extent allocation tree yet.
*
* The extent is inserted into the file, dropping any existing extents
* from the file that overlap the new one.
*
* File extent items of a type other than regular, prealloc or inline are
* skipped and 0 is returned for them.
*
* Returns 0 on success, -EIO if the inode can not be read, or the error of
* the first step that failed.
*/
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct extent_buffer *eb, int slot,
struct btrfs_key *key)
{
struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_fs_info *fs_info = root->fs_info;
int found_type;
u64 extent_end;
u64 start = key->offset;
u64 nbytes = 0;
struct btrfs_file_extent_item *item;
struct inode *inode = NULL;
unsigned long size;
int ret = 0;
item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
found_type = btrfs_file_extent_type(eb, item);
/* Work out the end of the file range covered and the bytes to account. */
if (found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
nbytes = btrfs_file_extent_num_bytes(eb, item);
extent_end = start + nbytes;
/*
* We don't add to the inodes nbytes if we are prealloc or a
* hole.
*/
if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
nbytes = 0;
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
size = btrfs_file_extent_ram_bytes(eb, item);
nbytes = btrfs_file_extent_ram_bytes(eb, item);
extent_end = ALIGN(start + size,
fs_info->sectorsize);
} else {
ret = 0;
goto out;
}
inode = read_one_inode(root, key->objectid);
if (!inode) {
ret = -EIO;
goto out;
}
/*
* first check to see if we already have this extent in the
* file. This must be done before the btrfs_drop_extents run
* so we don't try to drop this extent.
*/
ret = btrfs_lookup_file_extent(trans, root, path,
btrfs_ino(BTRFS_I(inode)), start, 0);
if (ret == 0 &&
(found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
struct btrfs_file_extent_item cmp1;
struct btrfs_file_extent_item cmp2;
struct btrfs_file_extent_item *existing;
struct extent_buffer *leaf;
leaf = path->nodes[0];
existing = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
read_extent_buffer(eb, &cmp1, (unsigned long)item,
sizeof(cmp1));
read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
sizeof(cmp2));
/*
* we already have a pointer to this exact extent,
* we don't have to do anything
*/
if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
btrfs_release_path(path);
goto out;
}
}
btrfs_release_path(path);
/* drop any overlapping extents */
drop_args.start = start;
drop_args.end = extent_end;
drop_args.drop_cache = true;
ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
if (ret)
goto out;
if (found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
u64 offset;
unsigned long dest_offset;
struct btrfs_key ins;
/*
 * With the NO_HOLES feature no file extent item is inserted for an
 * extent with a disk bytenr of 0, only the inode gets updated.
 */
if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
btrfs_fs_incompat(fs_info, NO_HOLES))
goto update_inode;
ret = btrfs_insert_empty_item(trans, root, path, key,
sizeof(*item));
if (ret)
goto out;
dest_offset = btrfs_item_ptr_offset(path->nodes[0],
path->slots[0]);
copy_extent_buffer(path->nodes[0], eb, dest_offset,
(unsigned long)item, sizeof(*item));
/* Key of the data extent: disk bytenr and length on disk. */
ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
ins.type = BTRFS_EXTENT_ITEM_KEY;
offset = key->offset - btrfs_file_extent_offset(eb, item);
/*
* Manually record dirty extent, as here we did a shallow
* file extent item copy and skip normal backref update,
* but modifying extent tree all by ourselves.
* So need to manually record dirty extent for qgroup,
* as the owner of the file extent changed from log tree
* (doesn't affect qgroup) to fs/file tree(affects qgroup)
*/
ret = btrfs_qgroup_trace_extent(trans,
btrfs_file_extent_disk_bytenr(eb, item),
btrfs_file_extent_disk_num_bytes(eb, item));
if (ret < 0)
goto out;
if (ins.objectid > 0) {
struct btrfs_ref ref = { 0 };
u64 csum_start;
u64 csum_end;
LIST_HEAD(ordered_sums);
/*
* is this extent already allocated in the extent
* allocation tree? If so, just add a reference
*/
ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
ins.offset);
if (ret < 0) {
goto out;
} else if (ret == 0) {
btrfs_init_generic_ref(&ref,
BTRFS_ADD_DELAYED_REF,
ins.objectid, ins.offset, 0);
btrfs_init_data_ref(&ref,
root->root_key.objectid,
key->objectid, offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret)
goto out;
} else {
/*
* insert the extent pointer in the extent
* allocation tree
*/
ret = btrfs_alloc_logged_file_extent(trans,
root->root_key.objectid,
key->objectid, offset, &ins);
if (ret)
goto out;
}
btrfs_release_path(path);
/*
 * Range of checksums to copy from the log: the whole extent on
 * disk when it is compressed, otherwise only the part of it that
 * this file extent item references.
 */
if (btrfs_file_extent_compression(eb, item)) {
csum_start = ins.objectid;
csum_end = csum_start + ins.offset;
} else {
csum_start = ins.objectid +
btrfs_file_extent_offset(eb, item);
csum_end = csum_start +
btrfs_file_extent_num_bytes(eb, item);
}
ret = btrfs_lookup_csums_list(root->log_root,
csum_start, csum_end - 1,
&ordered_sums, 0, false);
if (ret)
goto out;
/*
* Now delete all existing csums in the csum root that
* cover our range. We do this because we can have an
* extent that is completely referenced by one file
* extent item and partially referenced by another
* file extent item (like after using the clone or
* extent_same ioctls). In this case if we end up doing
* the replay of the one that partially references the
* extent first, and we do not do the csum deletion
* below, we can get 2 csum items in the csum tree that
* overlap each other. For example, imagine our log has
* the two following file extent items:
*
* key (257 EXTENT_DATA 409600)
* extent data disk byte 12845056 nr 102400
* extent data offset 20480 nr 20480 ram 102400
*
* key (257 EXTENT_DATA 819200)
* extent data disk byte 12845056 nr 102400
* extent data offset 0 nr 102400 ram 102400
*
* Where the second one fully references the 100K extent
* that starts at disk byte 12845056, and the log tree
* has a single csum item that covers the entire range
* of the extent:
*
* key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
*
* After the first file extent item is replayed, the
* csum tree gets the following csum item:
*
* key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
*
* Which covers the 20K sub-range starting at offset 20K
* of our extent. Now when we replay the second file
* extent item, if we do not delete existing csum items
* that cover any of its blocks, we end up getting two
* csum items in our csum tree that overlap each other:
*
* key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
* key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
*
* Which is a problem, because after this anyone trying
* to look up the checksum of any block of our
* extent starting at an offset of 40K or higher, will
* end up looking at the second csum item only, which
* does not contain the checksum for any block starting
* at offset 40K or higher of our extent.
*
* Note that after a failure the loop keeps going so that
* every entry of the list is still freed, it just stops
* touching the csum tree.
*/
while (!list_empty(&ordered_sums)) {
struct btrfs_ordered_sum *sums;
struct btrfs_root *csum_root;
sums = list_entry(ordered_sums.next,
struct btrfs_ordered_sum,
list);
csum_root = btrfs_csum_root(fs_info,
sums->bytenr);
if (!ret)
ret = btrfs_del_csums(trans, csum_root,
sums->bytenr,
sums->len);
if (!ret)
ret = btrfs_csum_file_blocks(trans,
csum_root,
sums);
list_del(&sums->list);
kfree(sums);
}
if (ret)
goto out;
} else {
btrfs_release_path(path);
}
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
/* inline extents are easy, we just overwrite them */
ret = overwrite_item(trans, root, path, eb, slot, key);
if (ret)
goto out;
}
ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
extent_end - start);
if (ret)
goto out;
update_inode:
/* Account the bytes added and the bytes found by the drop above. */
btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
out:
iput(inode);
return ret;
}
/*
 * Unlink @name, which points to @inode, from directory @dir and then run the
 * delayed items.
 *
 * Returns 0 on success or the error from btrfs_unlink_inode() /
 * btrfs_run_delayed_items().
 */
static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
				       struct btrfs_inode *dir,
				       struct btrfs_inode *inode,
				       const struct fscrypt_str *name)
{
	int ret = btrfs_unlink_inode(trans, dir, inode, name);

	/*
	 * Name existence checks done during log replay look at the
	 * fs/subvolume tree, so the delayed items must be run right after an
	 * unlink, otherwise later checks would still find the name.
	 */
	if (!ret)
		ret = btrfs_run_delayed_items(trans);

	return ret;
}
/*
 * When cleaning up conflicts between the directory names in the subvolume,
 * the directory names in the log and the names in the inode back references,
 * we may have to unlink inodes from directories.
 *
 * This helper unlinks the inode that the directory item @di points to from
 * @dir. @di must live in the leaf at path->nodes[0]; the path is released
 * once the name and location have been read from it.
 *
 * Returns 0 on success, -ENOMEM if the name can't be copied, -EIO if the
 * inode can't be read, or the error from link_to_fixup_dir() /
 * unlink_inode_for_log_replay().
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct btrfs_root *root = dir->root;
	struct extent_buffer *leaf = path->nodes[0];
	struct btrfs_key location;
	struct fscrypt_str name;
	struct inode *inode;
	int ret;

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	/* The name is stored right after the dir item header. */
	if (read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name))
		return -ENOMEM;

	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (inode) {
		ret = link_to_fixup_dir(trans, root, path, location.objectid);
		if (!ret)
			ret = unlink_inode_for_log_replay(trans, dir,
							  BTRFS_I(inode), &name);
	} else {
		ret = -EIO;
	}

	kfree(name.name);
	iput(inode);
	return ret;
}
/*
 * Check whether a name and sequence number taken from an inode back
 * reference already exist in directory @dirid and point to inode @objectid.
 * Both the dir index item (@index, @name) and the dir item (@name) must exist
 * and point to @objectid.
 *
 * Returns: < 0 on error, 0 if the directory entry does not exist and 1 if it
 * exists. The path is released on return.
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 struct fscrypt_str *name)
{
	struct btrfs_key location;
	struct btrfs_dir_item *di;
	int found = 0;

	/* First the dir index item. */
	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, 0);
	if (IS_ERR(di)) {
		found = PTR_ERR(di);
		goto release;
	}
	if (!di)
		goto release;

	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
	if (location.objectid != objectid)
		goto release;

	/* Then the dir item, looked up by name. */
	btrfs_release_path(path);
	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, 0);
	if (IS_ERR(di)) {
		found = PTR_ERR(di);
	} else if (di) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		found = (location.objectid == objectid);
	}

release:
	btrfs_release_path(path);
	return found;
}
/*
 * Helper to check a log tree for a named back reference of an inode. This is
 * used to decide if a back reference that is found in the subvolume
 * conflicts with what we find in the log.
 *
 * Inode back reference items may hold multiple refs. During replay we
 * process one reference at a time, and we don't want to delete valid links
 * to a file from the subvolume if that link is also in the log.
 *
 * @key is the key of the INODE_REF or INODE_EXTREF item to look up in @log,
 * @ref_objectid is only used for extended refs.
 *
 * Returns 1 if @name is in the item, 0 if the item or the name is missing,
 * and < 0 on error.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const struct fscrypt_str *name)
{
	struct btrfs_path *path = btrfs_alloc_path();
	int ret;

	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret == 1) {
		/* No item with this key in the log. */
		ret = 0;
	} else if (ret >= 0) {
		struct extent_buffer *leaf = path->nodes[0];
		const int slot = path->slots[0];

		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ret = btrfs_find_name_in_ext_backref(leaf, slot,
							     ref_objectid,
							     name) != NULL;
		else
			ret = btrfs_find_name_in_backref(leaf, slot,
							 name) != NULL;
	}

	btrfs_free_path(path);
	return ret;
}
/*
 * Remove from the subvolume tree @root everything that conflicts with the
 * inode reference (@name, @ref_index) in directory @dir about to be replayed
 * for inode @inode_objectid:
 *
 * 1) every name in the INODE_REF item (@inode_objectid, @parent_objectid)
 *    that is not also present in @log_root is unlinked from @dir;
 * 2) every entry of the INODE_EXTREF item found for @name whose parent is
 *    @parent_objectid and that is not present in @log_root is unlinked from
 *    that parent;
 * 3) a dir index item at @ref_index with @name in @dir is dropped;
 * 4) a dir item with @name in @dir is dropped.
 *
 * After each unlink done for 1) or 2) the search restarts from the beginning,
 * since the items in the tree have changed.
 *
 * Returns 0 on success, 1 if the INODE_REF key found has objectid == offset
 * (a back reference for the root directory, nothing to do), or < 0 on error.
 * The path is released on the normal (0) return only; on the other returns
 * it may still hold a leaf.
 */
static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct btrfs_inode *dir,
				  struct btrfs_inode *inode,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, struct fscrypt_str *name)
{
	int ret;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log.  if so, we allow them to stay
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			struct fscrypt_str victim_name;

			victim_ref = (struct btrfs_inode_ref *)ptr;
			ret = read_alloc_one_name(leaf, (victim_ref + 1),
				 btrfs_inode_ref_name_len(leaf, victim_ref),
				 &victim_name);
			if (ret)
				return ret;

			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, &victim_name);
			if (ret < 0) {
				kfree(victim_name.name);
				return ret;
			} else if (!ret) {
				/*
				 * NOTE(review): the link count is bumped
				 * before the unlink, presumably to offset the
				 * decrement the unlink does - confirm.
				 */
				inc_nlink(&inode->vfs_inode);
				btrfs_release_path(path);

				ret = unlink_inode_for_log_replay(trans, dir, inode,
						&victim_name);
				kfree(victim_name.name);
				if (ret)
					return ret;
				goto again;
			}
			kfree(victim_name.name);

			/* Each entry is a ref header followed by the name. */
			ptr = (unsigned long)(victim_ref + 1) + victim_name.len;
		}
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (IS_ERR(extref)) {
		return PTR_ERR(extref);
	} else if (extref) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			struct fscrypt_str victim_name;

			extref = (struct btrfs_inode_extref *)(base + cur_offset);
			/*
			 * Read the name length before anything else: the
			 * "next" label below needs it to step over this entry,
			 * and it is also reached for entries that are skipped
			 * without their name ever being read.
			 */
			victim_name.len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			ret = read_alloc_one_name(leaf, &extref->name,
						  victim_name.len, &victim_name);
			if (ret)
				return ret;

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name.name,
							      victim_name.len);
			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, &victim_name);
			if (ret < 0) {
				kfree(victim_name.name);
				return ret;
			} else if (!ret) {
				/* -ENOENT is returned if the parent can't be read. */
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
						parent_objectid);
				if (victim_parent) {
					inc_nlink(&inode->vfs_inode);
					btrfs_release_path(path);

					ret = unlink_inode_for_log_replay(trans,
							BTRFS_I(victim_parent),
							inode, &victim_name);
				}
				iput(victim_parent);
				kfree(victim_name.name);
				if (ret)
					return ret;
				goto again;
			}
			kfree(victim_name.name);
next:
			cur_offset += victim_name.len + sizeof(*extref);
		}
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, 0);
	if (IS_ERR(di)) {
		return PTR_ERR(di);
	} else if (di) {
		ret = drop_one_dir_item(trans, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0);
	if (IS_ERR(di)) {
		return PTR_ERR(di);
	} else if (di) {
		ret = drop_one_dir_item(trans, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}
/*
 * Decode the extended inode reference found at address @ref_ptr inside @eb.
 *
 * The name is handed to read_alloc_one_name() which fills in @name; callers in
 * this file release it with kfree(name->name). @index and @parent_objectid are
 * optional outputs, pass NULL for the ones that are not needed.
 *
 * Returns 0 on success or the error reported by read_alloc_one_name().
 */
static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     struct fscrypt_str *name, u64 *index,
			     u64 *parent_objectid)
{
	struct btrfs_inode_extref *entry = (struct btrfs_inode_extref *)ref_ptr;
	int err;

	err = read_alloc_one_name(eb, &entry->name,
				  btrfs_inode_extref_name_len(eb, entry), name);
	if (err)
		return err;

	if (index != NULL)
		*index = btrfs_inode_extref_index(eb, entry);

	if (parent_objectid != NULL)
		*parent_objectid = btrfs_inode_extref_parent(eb, entry);

	return 0;
}
/*
 * Decode the (non-extended) inode reference found at address @ref_ptr inside
 * @eb.
 *
 * The name is handed to read_alloc_one_name() which fills in @name; callers in
 * this file release it with kfree(name->name). @index is an optional output
 * and may be NULL.
 *
 * Returns 0 on success or the error reported by read_alloc_one_name().
 */
static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  struct fscrypt_str *name, u64 *index)
{
	struct btrfs_inode_ref *entry = (struct btrfs_inode_ref *)ref_ptr;
	int err;

	/* The name bytes start right after the fixed size reference header. */
	err = read_alloc_one_name(eb, entry + 1,
				  btrfs_inode_ref_name_len(eb, entry), name);
	if (err)
		return err;

	if (index != NULL)
		*index = btrfs_inode_ref_index(eb, entry);

	return 0;
}
/*
* Take an inode reference item from the log tree and iterate all names from the
* inode reference item in the subvolume tree with the same key (if it exists).
* For any name that is not in the inode reference item from the log tree, do a
* proper unlink of that name (that is, remove its entry from the inode
* reference item and both dir index keys).
*/
static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_inode *inode,
struct extent_buffer *log_eb,
int log_slot,
struct btrfs_key *key)
{
int ret;
unsigned long ref_ptr;
unsigned long ref_end;
struct extent_buffer *eb;
again:
btrfs_release_path(path);
ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
if (ret > 0) {
ret = 0;
goto out;
}
if (ret < 0)
goto out;
eb = path->nodes[0];
ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
while (ref_ptr < ref_end) {
struct fscrypt_str name;
u64 parent_id;
if (key->type == BTRFS_INODE_EXTREF_KEY) {
ret = extref_get_fields(eb, ref_ptr, &name,
NULL, &parent_id);
} else {
parent_id = key->offset;
ret = ref_get_fields(eb, ref_ptr, &name, NULL);
}
if (ret)
goto out;
if (key->type == BTRFS_INODE_EXTREF_KEY)
ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
parent_id, &name);
else
ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name);
if (!ret) {
struct inode *dir;
btrfs_release_path(path);
dir = read_one_inode(root, parent_id);
if (!dir) {
ret = -ENOENT;
kfree(name.name);
goto out;
}
ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir),
inode, &name);
kfree(name.name);
iput(dir);
if (ret)
goto out;
goto again;
}
kfree(name.name);
ref_ptr += name.len;
if (key->type == BTRFS_INODE_EXTREF_KEY)
ref_ptr += sizeof(struct btrfs_inode_extref);
else
ref_ptr += sizeof(struct btrfs_inode_ref);
}
ret = 0;
out:
btrfs_release_path(path);
return ret;
}
/*
* replay one inode back reference item found in the log tree.
* eb, slot and key refer to the buffer and key found in the log tree.
* root is the destination we are replaying into, and path is for temp
* use by this function. (it should be released on return).
*/
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_root *log,
struct btrfs_path *path,
struct extent_buffer *eb, int slot,
struct btrfs_key *key)
{
struct inode *dir = NULL;
struct inode *inode = NULL;
unsigned long ref_ptr;
unsigned long ref_end;
struct fscrypt_str name;
int ret;
int log_ref_ver = 0;
u64 parent_objectid;
u64 inode_objectid;
u64 ref_index = 0;
int ref_struct_size;
ref_ptr = btrfs_item_ptr_offset(eb, slot);
ref_end = ref_ptr + btrfs_item_size(eb, slot);
if (key->type == BTRFS_INODE_EXTREF_KEY) {
struct btrfs_inode_extref *r;
ref_struct_size = sizeof(struct btrfs_inode_extref);
log_ref_ver = 1;
r = (struct btrfs_inode_extref *)ref_ptr;
parent_objectid = btrfs_inode_extref_parent(eb, r);
} else {
ref_struct_size = sizeof(struct btrfs_inode_ref);
parent_objectid = key->offset;
}
inode_objectid = key->objectid;
/*
* it is possible that we didn't log all the parent directories
* for a given inode. If we don't find the dir, just don't
* copy the back ref in. The link count fixup code will take
* care of the rest
*/
dir = read_one_inode(root, parent_objectid);
if (!dir) {
ret = -ENOENT;
goto out;
}
inode = read_one_inode(root, inode_objectid);
if (!inode) {
ret = -EIO;
goto out;
}
while (ref_ptr < ref_end) {
if (log_ref_ver) {
ret = extref_get_fields(eb, ref_ptr, &name,
&ref_index, &parent_objectid);
/*
* parent object can change from one array
* item to another.
*/
if (!dir)
dir = read_one_inode(root, parent_objectid);
if (!dir) {
ret = -ENOENT;
goto out;
}
} else {
ret = ref_get_fields(eb, ref_ptr, &name, &ref_index);
}
if (ret)
goto out;
ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
btrfs_ino(BTRFS_I(inode)), ref_index, &name);
if (ret < 0) {
goto out;
} else if (ret == 0) {
/*
* look for a conflicting back reference in the
* metadata. if we find one we have to unlink that name
* of the file before we add our new link. Later on, we
* overwrite any existing back reference, and we don't
* want to create dangling pointers in the directory.
*/
ret = __add_inode_ref(trans, root, path, log,
BTRFS_I(dir), BTRFS_I(inode),
inode_objectid, parent_objectid,
ref_index, &name);
if (ret) {
if (ret == 1)
ret = 0;
goto out;
}
/* insert our name */
ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
&name, 0, ref_index);
if (ret)
goto out;
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret)
goto out;
}
/* Else, ret == 1, we already have a perfect match, we're done. */
ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len;
kfree(name.name);
name.name = NULL;
if (log_ref_ver) {
iput(dir);
dir = NULL;
}
}
/*
* Before we overwrite the inode reference item in the subvolume tree
* with the item from the log tree, we must unlink all names from the
* parent directory that are in the subvolume's tree inode reference
* item, otherwise we end up with an inconsistent subvolume tree where
* dir index entries exist for a name but there is no inode reference
* item with the same name.
*/
ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
key);
if (ret)
goto out;
/* finally write the back reference in the inode */
ret = overwrite_item(trans, root, path, eb, slot, key);
out:
btrfs_release_path(path);
kfree(name.name);
iput(dir);
iput(inode);
return ret;
}
/*
 * Count how many names are stored in the extended inode reference items of
 * @inode in @root.
 *
 * Items are fetched one at a time with btrfs_find_one_extref(), each item may
 * hold several names.
 *
 * Returns the number of names found or < 0 on error. -ENOENT from the lookup
 * only ends the iteration, it is not reported as an error. @path is released
 * on return.
 */
static int count_inode_extrefs(struct btrfs_root *root,
			       struct btrfs_inode *inode, struct btrfs_path *path)
{
	const u64 ino = btrfs_ino(inode);
	struct btrfs_inode_extref *extref;
	unsigned int found = 0;
	u64 search_offset = 0;
	int ret;

	for (;;) {
		struct extent_buffer *leaf;
		unsigned long item_start;
		u32 item_len;
		u32 pos = 0;

		ret = btrfs_find_one_extref(root, ino, search_offset, path,
					    &extref, &search_offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_len = btrfs_item_size(leaf, path->slots[0]);
		item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);

		/* One extref header plus its name per link. */
		while (pos < item_len) {
			int name_len;

			extref = (struct btrfs_inode_extref *)(item_start + pos);
			name_len = btrfs_inode_extref_name_len(leaf, extref);
			found++;
			pos += name_len + sizeof(*extref);
		}

		/* Continue the search past the item just processed. */
		search_offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0 && ret != -ENOENT)
		return ret;

	return found;
}
/*
 * Count how many names are stored in the BTRFS_INODE_REF_KEY items of @inode
 * in @root.
 *
 * The items are visited from the highest key offset down to the lowest one,
 * each item may hold several names.
 *
 * Returns the number of names found or < 0 if searching the tree failed.
 * @path is released on return.
 */
static int count_inode_refs(struct btrfs_root *root,
			    struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			/* No exact match, look at the item right before. */
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size(path->nodes[0],
						path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		/* Stay in this leaf while there are earlier slots to check. */
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	/*
	 * Only a failed search leaves a negative value in ret. Report it
	 * instead of a partial count, otherwise the caller would write a wrong
	 * link count to the inode.
	 */
	if (ret < 0)
		return ret;

	return nlink;
}
/*
 * The link count of a file can't always be kept accurate while the log is
 * being replayed. Rather than complicating the log code, every file that went
 * through replay gets its back references scanned afterwards.
 *
 * The scan sets the inode's link count to the number of back references found
 * (regular plus extended). When that number is zero an orphan item is inserted
 * for the inode, and for a directory its entries are deleted first.
 *
 * Returns 0 on success and < 0 on error.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	const u64 ino = btrfs_ino(BTRFS_I(inode));
	struct btrfs_path *path;
	u64 total_links;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;
	total_links = ret;

	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;
	total_links += ret;

	ret = 0;

	if (total_links != inode->i_nlink) {
		set_nlink(inode, total_links);
		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
		if (ret)
			goto out;
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	/* Still linked somewhere, nothing else to do. */
	if (inode->i_nlink != 0)
		goto out;

	if (S_ISDIR(inode->i_mode)) {
		ret = replay_dir_deletes(trans, root, NULL, path, ino, 1);
		if (ret)
			goto out;
	}

	ret = btrfs_insert_orphan_item(trans, root, ino);
	if (ret == -EEXIST)
		ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
/*
 * Process every item recorded under BTRFS_TREE_LOG_FIXUP_OBJECTID in @root:
 * the item is deleted and the link count of the inode it names (the key
 * offset) is fixed up with fixup_inode_link_count().
 *
 * Returns 0 on success and < 0 on error. @path is released on return.
 */
static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	struct btrfs_key key;
	int ret;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;

	for (;;) {
		struct inode *inode;

		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			/* No exact match, use the item right before. */
			ret = 0;
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (!(key.objectid == BTRFS_TREE_LOG_FIXUP_OBJECTID &&
		      key.type == BTRFS_ORPHAN_ITEM_KEY))
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			break;
		btrfs_release_path(path);

		inode = read_one_inode(root, key.offset);
		if (!inode) {
			ret = -EIO;
			break;
		}

		ret = fixup_inode_link_count(trans, root, inode);
		iput(inode);
		if (ret)
			break;

		/*
		 * Fixing up a directory may create new entries, so always
		 * restart the search from the highest possible offset.
		 */
		key.offset = (u64)-1;
	}

	btrfs_release_path(path);
	return ret;
}
/*
 * Record inode @objectid in the fixup dir so that its link count gets checked
 * once replay is done. When the record is newly inserted the link count is
 * bumped (or set to 1 if it was 0) so the inode won't go away before that
 * check happens. An already existing record is not an error.
 *
 * Returns 0 on success and < 0 on error. @path is released on return.
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key fixup_key;
	struct inode *inode;
	int ret;

	inode = read_one_inode(root, objectid);
	if (!inode)
		return -EIO;

	fixup_key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	fixup_key.type = BTRFS_ORPHAN_ITEM_KEY;
	fixup_key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &fixup_key, 0);
	btrfs_release_path(path);

	switch (ret) {
	case 0:
		if (inode->i_nlink)
			inc_nlink(inode);
		else
			set_nlink(inode, 1);
		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
		break;
	case -EEXIST:
		/* Already recorded, the link count was adjusted back then. */
		ret = 0;
		break;
	default:
		break;
	}

	iput(inode);
	return ret;
}
/*
 * When replaying the log for a directory, names are only inserted for inodes
 * that actually exist. This means an fsync on a directory does not implicitly
 * fsync all the new files in it.
 *
 * Adds @name with index @index to directory @dirid, pointing at the inode
 * given by @location->objectid.
 *
 * Returns -ENOENT if the target inode can't be read, -EIO if the directory
 * can't be read, otherwise the result of btrfs_add_link().
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    u64 dirid, u64 index,
				    const struct fscrypt_str *name,
				    struct btrfs_key *location)
{
	struct inode *inode = read_one_inode(root, location->objectid);
	struct inode *dir;
	int ret;

	if (!inode)
		return -ENOENT;

	dir = read_one_inode(root, dirid);
	if (dir)
		ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
				     1, index);
	else
		ret = -EIO;

	/* FIXME, put inode into FIXUP list */

	iput(inode);
	/* iput() of a NULL inode is relied upon elsewhere in this file too. */
	iput(dir);
	return ret;
}
/*
 * Decide what to do with @dst_di, an entry found in the subvolume (in the leaf
 * at @path->nodes[0]) that conflicts with an entry from the log.
 *
 * @log_key and @log_flags describe the log's entry, @exists tells whether the
 * inode the log's entry points to exists.
 *
 * Returns 1 if @dst_di already matches the log's entry (nothing is deleted),
 * 0 if the entry was dropped or deliberately left alone because the new inode
 * does not exist, and < 0 on error.
 */
static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
					struct btrfs_inode *dir,
					struct btrfs_path *path,
					struct btrfs_dir_item *dst_di,
					const struct btrfs_key *log_key,
					u8 log_flags,
					bool exists)
{
	struct extent_buffer *leaf = path->nodes[0];
	struct btrfs_key dst_key;
	bool same_location;

	btrfs_dir_item_key_to_cpu(leaf, dst_di, &dst_key);

	same_location = dst_key.objectid == log_key->objectid &&
			dst_key.type == log_key->type &&
			dst_key.offset == log_key->offset;

	/* The existing dentry points to the same inode, don't delete it. */
	if (same_location && btrfs_dir_flags(leaf, dst_di) == log_flags)
		return 1;

	/*
	 * Without an inode for the new entry there is nothing to replace the
	 * conflicting entry with, so keep it.
	 */
	return exists ? drop_one_dir_item(trans, path, dir, dst_di) : 0;
}
/*
* take a single entry in a log directory item and replay it into
* the subvolume.
*
* if a conflicting item exists in the subdirectory already,
* the inode it points to is unlinked and put into the link count
* fix up tree.
*
* If a name from the log points to a file or directory that does
* not exist in the FS, it is skipped. fsyncs on directories
* do not force down inodes inside that directory, just changes to the
* names or unlinks in a directory.
*
* Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
* non-existing inode) and 1 if the name was replayed.
*/
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct extent_buffer *eb,
struct btrfs_dir_item *di,
struct btrfs_key *key)
{
struct fscrypt_str name;
struct btrfs_dir_item *dir_dst_di;
struct btrfs_dir_item *index_dst_di;
bool dir_dst_matches = false;
bool index_dst_matches = false;
struct btrfs_key log_key;
struct btrfs_key search_key;
struct inode *dir;
u8 log_flags;
bool exists;
int ret;
bool update_size = true;
bool name_added = false;
dir = read_one_inode(root, key->objectid);
if (!dir)
return -EIO;
ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
if (ret)
goto out;
log_flags = btrfs_dir_flags(eb, di);
btrfs_dir_item_key_to_cpu(eb, di, &log_key);
ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
btrfs_release_path(path);
if (ret < 0)
goto out;
exists = (ret == 0);
ret = 0;
dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
&name, 1);
if (IS_ERR(dir_dst_di)) {
ret = PTR_ERR(dir_dst_di);
goto out;
} else if (dir_dst_di) {
ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
dir_dst_di, &log_key,
log_flags, exists);
if (ret < 0)
goto out;
dir_dst_matches = (ret == 1);
}
btrfs_release_path(path);
index_dst_di = btrfs_lookup_dir_index_item(trans, root, path,
key->objectid, key->offset,
&name, 1);
if (IS_ERR(index_dst_di)) {
ret = PTR_ERR(index_dst_di);
goto out;
} else if (index_dst_di) {
ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
index_dst_di, &log_key,
log_flags, exists);
if (ret < 0)
goto out;
index_dst_matches = (ret == 1);
}
btrfs_release_path(path);
if (dir_dst_matches && index_dst_matches) {
ret = 0;
update_size = false;
goto out;
}
/*
* Check if the inode reference exists in the log for the given name,
* inode and parent inode
*/
search_key.objectid = log_key.objectid;
search_key.type = BTRFS_INODE_REF_KEY;
search_key.offset = key->objectid;
ret = backref_in_log(root->log_root, &search_key, 0, &name);
if (ret < 0) {
goto out;
} else if (ret) {
/* The dentry will be added later. */
ret = 0;
update_size = false;
goto out;
}
search_key.objectid = log_key.objectid;
search_key.type = BTRFS_INODE_EXTREF_KEY;
search_key.offset = key->objectid;
ret = backref_in_log(root->log_root, &search_key, key->objectid, &name);
if (ret < 0) {
goto out;
} else if (ret) {
/* The dentry will be added later. */
ret = 0;
update_size = false;
goto out;
}
btrfs_release_path(path);
ret = insert_one_name(trans, root, key->objectid, key->offset,
&name, &log_key);
if (ret && ret != -ENOENT && ret != -EEXIST)
goto out;
if (!ret)
name_added = true;
update_size = false;
ret = 0;
out:
if (!ret && update_size) {
btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2);
ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
}
kfree(name.name);
iput(dir);
if (!ret && name_added)
ret = 1;
return ret;
}
/*
 * Replay one dir item from a BTRFS_DIR_INDEX_KEY key.
 *
 * @eb, @slot and @key identify the item in the log tree, @root is the
 * subvolume being replayed into and @path is for temporary use.
 *
 * Returns 0 on success and < 0 on error. A positive value is never returned:
 * the caller in this file stops processing the log leaf on any non-zero value.
 */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct extent_buffer *eb, int slot,
					struct btrfs_key *key)
{
	struct btrfs_path *fixup_path;
	struct btrfs_dir_item *di;
	struct btrfs_key di_key;
	int ret;

	/* We only log dir index keys, which only contain a single dir item. */
	ASSERT(key->type == BTRFS_DIR_INDEX_KEY);

	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
	ret = replay_one_name(trans, root, path, eb, di, key);
	if (ret < 0)
		return ret;

	/*
	 * If this entry refers to a non-directory (directories can not have a
	 * link count > 1) and it was added in the transaction that was not
	 * committed, make sure we fixup the link count of the inode the entry
	 * points to. Otherwise something like the following would result in a
	 * directory pointing to an inode with a wrong link that does not account
	 * for this dir entry:
	 *
	 * mkdir testdir
	 * touch testdir/foo
	 * touch testdir/bar
	 * sync
	 *
	 * ln testdir/bar testdir/bar_link
	 * ln testdir/foo testdir/foo_link
	 * xfs_io -c "fsync" testdir/bar
	 *
	 * <power failure>
	 *
	 * mount fs, log replay happens
	 *
	 * File foo would remain with a link count of 1 when it has two entries
	 * pointing to it in the directory testdir. This would make it impossible
	 * to ever delete the parent directory as it would result in stale
	 * dentries that can never be deleted.
	 *
	 * In every other case we are done. Return 0 and not the 1 that
	 * replay_one_name() uses for "name was replayed", as that is not an
	 * error and must not stop the walk of the log tree.
	 */
	if (ret != 1 || btrfs_dir_ftype(eb, di) == BTRFS_FT_DIR)
		return 0;

	fixup_path = btrfs_alloc_path();
	if (!fixup_path)
		return -ENOMEM;

	btrfs_dir_item_key_to_cpu(eb, di, &di_key);
	ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid);
	btrfs_free_path(fixup_path);

	return ret;
}
/*
* directory replay has two parts. There are the standard directory
* items in the log copied from the subvolume, and range items
* created in the log while the subvolume was logged.
*
* The range items tell us which parts of the key space the log
* is authoritative for. During replay, if a key in the subvolume
* directory is in a logged range item, but not actually in the log
* that means it was deleted from the directory before the fsync
* and should be removed.
*
* This helper finds, in @root, the range item (BTRFS_DIR_LOG_INDEX_KEY) of
* directory @dirid that contains *@start_ret or, failing that, the item in
* the slot that follows the search position.
*
* Returns 0 when a range was found, with *@start_ret set to the key offset of
* the item and *@end_ret set to its btrfs_dir_log_end() value. Returns a
* positive value when there is no such range (this includes *@start_ret being
* (u64)-1 on entry) and < 0 on error. Once a search was done, @path is
* released before returning.
*/
static noinline int find_dir_range(struct btrfs_root *root,
struct btrfs_path *path,
u64 dirid,
u64 *start_ret, u64 *end_ret)
{
struct btrfs_key key;
u64 found_end;
struct btrfs_dir_log_item *item;
int ret;
int nritems;
/* The whole key space was already covered by previous calls. */
if (*start_ret == (u64)-1)
return 1;
key.objectid = dirid;
key.type = BTRFS_DIR_LOG_INDEX_KEY;
key.offset = *start_ret;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
if (ret > 0) {
/* No exact match: step back to the item before the search position. */
if (path->slots[0] == 0)
goto out;
path->slots[0]--;
}
/* On an exact match 'key' already holds the key of the item found. */
if (ret != 0)
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
ret = 1;
goto next;
}
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_dir_log_item);
found_end = btrfs_dir_log_end(path->nodes[0], item);
/* Does this range item cover the requested start offset? */
if (*start_ret >= key.offset && *start_ret <= found_end) {
ret = 0;
*start_ret = key.offset;
*end_ret = found_end;
goto out;
}
ret = 1;
next:
/* check the next slot in the tree to see if it is a valid item */
nritems = btrfs_header_nritems(path->nodes[0]);
path->slots[0]++;
if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(root, path);
/* Non-zero covers both an error and running out of leaves. */
if (ret)
goto out;
}
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
ret = 1;
goto out;
}
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_dir_log_item);
found_end = btrfs_dir_log_end(path->nodes[0], item);
*start_ret = key.offset;
*end_ret = found_end;
ret = 0;
out:
btrfs_release_path(path);
return ret;
}
/*
* this looks for a given directory item in the log. If the directory
* item is not in the log, the item is removed and the inode it points
* to is unlinked
*/
static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *log,
struct btrfs_path *path,
struct btrfs_path *log_path,
struct inode *dir,
struct btrfs_key *dir_key)
{
struct btrfs_root *root = BTRFS_I(dir)->root;
int ret;
struct extent_buffer *eb;
int slot;
struct btrfs_dir_item *di;
struct fscrypt_str name;
struct inode *inode = NULL;
struct btrfs_key location;
/*
* Currently we only log dir index keys. Even if we replay a log created
* by an older kernel that logged both dir index and dir item keys, all
* we need to do is process the dir index keys, we (and our caller) can
* safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
*/
ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY);
eb = path->nodes[0];
slot = path->slots[0];
di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
if (ret)
goto out;
if (log) {
struct btrfs_dir_item *log_di;
log_di = btrfs_lookup_dir_index_item(trans, log, log_path,
dir_key->objectid,
dir_key->offset, &name, 0);
if (IS_ERR(log_di)) {
ret = PTR_ERR(log_di);
goto out;
} else if (log_di) {
/* The dentry exists in the log, we have nothing to do. */
ret = 0;
goto out;
}
}
btrfs_dir_item_key_to_cpu(eb, di, &location);
btrfs_release_path(path);
btrfs_release_path(log_path);
inode = read_one_inode(root, location.objectid);
if (!inode) {
ret = -EIO;
goto out;
}
ret = link_to_fixup_dir(trans, root, path, location.objectid);
if (ret)
goto out;
inc_nlink(inode);
ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode),
&name);
/*
* Unlike dir item keys, dir index keys can only have one name (entry) in
* them, as there are no key collisions since each key has a unique offset
* (an index number), so we're done.
*/
out:
btrfs_release_path(path);
btrfs_release_path(log_path);
kfree(name.name);
iput(inode);
return ret;
}
/*
* Delete from @root every xattr of inode @ino that has no xattr with the same
* name in the @log tree.
*
* The BTRFS_XATTR_ITEM_KEY items of @ino in @root are walked leaf by leaf. An
* item can hold several names and each of them is looked up in @log. After a
* name is deleted the search restarts from the key of the item it was in.
*
* Returns 0 on success and < 0 on error. @path is released on return.
*/
static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_root *log,
struct btrfs_path *path,
const u64 ino)
{
struct btrfs_key search_key;
struct btrfs_path *log_path;
int i;
int nritems;
int ret;
log_path = btrfs_alloc_path();
if (!log_path)
return -ENOMEM;
search_key.objectid = ino;
search_key.type = BTRFS_XATTR_ITEM_KEY;
search_key.offset = 0;
again:
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret < 0)
goto out;
process_leaf:
nritems = btrfs_header_nritems(path->nodes[0]);
for (i = path->slots[0]; i < nritems; i++) {
struct btrfs_key key;
struct btrfs_dir_item *di;
struct btrfs_dir_item *log_di;
u32 total_size;
u32 cur;
btrfs_item_key_to_cpu(path->nodes[0], &key, i);
/* Went past the last xattr item of this inode, we are done. */
if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
ret = 0;
goto out;
}
di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
total_size = btrfs_item_size(path->nodes[0], i);
cur = 0;
/* Walk every entry (header + name + data) packed in this item. */
while (cur < total_size) {
u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
u32 this_len = sizeof(*di) + name_len + data_len;
char *name;
name = kmalloc(name_len, GFP_NOFS);
if (!name) {
ret = -ENOMEM;
goto out;
}
read_extent_buffer(path->nodes[0], name,
(unsigned long)(di + 1), name_len);
log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
name, name_len, 0);
/* Only whether log_di is NULL or an error is used from here on. */
btrfs_release_path(log_path);
if (!log_di) {
/* Doesn't exist in log tree, so delete it. */
btrfs_release_path(path);
/* Look it up again, this time with the transaction, for deletion. */
di = btrfs_lookup_xattr(trans, root, path, ino,
name, name_len, -1);
kfree(name);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
goto out;
}
ASSERT(di);
ret = btrfs_delete_one_dir_name(trans, root,
path, di);
if (ret)
goto out;
btrfs_release_path(path);
/* The leaf was modified, restart from the item being processed. */
search_key = key;
goto again;
}
kfree(name);
if (IS_ERR(log_di)) {
ret = PTR_ERR(log_di);
goto out;
}
cur += this_len;
di = (struct btrfs_dir_item *)((char *)di + this_len);
}
}
ret = btrfs_next_leaf(root, path);
/* A positive return means there are no more leaves, not an error. */
if (ret > 0)
ret = 0;
else if (ret == 0)
goto process_leaf;
out:
btrfs_free_path(log_path);
btrfs_release_path(path);
return ret;
}
/*
* deletion replay happens before we copy any new directory items
* out of the log or out of backreferences from inodes. It
* scans the log to find ranges of keys that log is authoritative for,
* and then scans the directory to find items in those ranges that are
* not present in the log.
*
* Anything we don't find in the log is unlinked and removed from the
* directory.
*
* @dirid is the directory to process in @root and @path is for temporary use
* (released on return). When @del_all is non-zero no ranges are looked up in
* @log, instead the whole index key space of the directory is treated as one
* range; the caller in this file that does so also passes a NULL @log.
*
* Returns 0 on success, which includes the directory inode not existing in
* @root, and < 0 on error.
*/
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_root *log,
struct btrfs_path *path,
u64 dirid, int del_all)
{
u64 range_start;
u64 range_end;
int ret = 0;
struct btrfs_key dir_key;
struct btrfs_key found_key;
struct btrfs_path *log_path;
struct inode *dir;
dir_key.objectid = dirid;
dir_key.type = BTRFS_DIR_INDEX_KEY;
log_path = btrfs_alloc_path();
if (!log_path)
return -ENOMEM;
dir = read_one_inode(root, dirid);
/* it isn't an error if the inode isn't there, that can happen
* because we replay the deletes before we copy in the inode item
* from the log
*/
if (!dir) {
btrfs_free_path(log_path);
return 0;
}
range_start = 0;
range_end = 0;
/* Outer loop: one iteration per range the log is authoritative for. */
while (1) {
if (del_all)
range_end = (u64)-1;
else {
ret = find_dir_range(log, path, dirid,
&range_start, &range_end);
if (ret < 0)
goto out;
else if (ret > 0)
break;
}
dir_key.offset = range_start;
/* Inner loop: one iteration per dir index item of the subvolume in the range. */
while (1) {
int nritems;
ret = btrfs_search_slot(NULL, root, &dir_key, path,
0, 0);
if (ret < 0)
goto out;
nritems = btrfs_header_nritems(path->nodes[0]);
if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(root, path);
if (ret == 1)
break;
else if (ret < 0)
goto out;
}
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
/* No more dir index items for this directory at all. */
if (found_key.objectid != dirid ||
found_key.type != dir_key.type) {
ret = 0;
goto out;
}
if (found_key.offset > range_end)
break;
/* Releases both paths, so the next iteration searches again. */
ret = check_item_in_log(trans, log, path,
log_path, dir,
&found_key);
if (ret)
goto out;
/* Avoid wrapping around when computing the next offset. */
if (found_key.offset == (u64)-1)
break;
dir_key.offset = found_key.offset + 1;
}
btrfs_release_path(path);
if (range_end == (u64)-1)
break;
range_start = range_end + 1;
}
ret = 0;
out:
btrfs_release_path(path);
btrfs_free_path(log_path);
iput(dir);
return ret;
}
/*
* the process_func used to replay items from the log tree. This
* gets called in two different stages. The first stage just looks
* for inodes and makes sure they are all copied into the subvolume.
*
* The second stage copies all the other item types from the log into
* the subvolume. The two stage approach is slower, but gets rid of
* lots of complexity around inodes referencing other inodes that exist
* only in the log (references come from either directory items or inode
* back refs).
*
* @eb is a block of the @log tree, expected to have generation @gen and level
* @level. Blocks that are not leaves are only read and otherwise ignored. The
* current stage and the subvolume to replay into come from @wc.
*
* Returns 0 on success and non-zero on failure.
*/
static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
struct walk_control *wc, u64 gen, int level)
{
int nritems;
struct btrfs_tree_parent_check check = {
.transid = gen,
.level = level
};
struct btrfs_path *path;
struct btrfs_root *root = wc->replay_dest;
struct btrfs_key key;
int i;
int ret;
ret = btrfs_read_extent_buffer(eb, &check);
if (ret)
return ret;
level = btrfs_header_level(eb);
/* Only leaves hold items to replay. */
if (level != 0)
return 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
nritems = btrfs_header_nritems(eb);
for (i = 0; i < nritems; i++) {
btrfs_item_key_to_cpu(eb, &key, i);
/* inode keys are done during the first stage */
if (key.type == BTRFS_INODE_ITEM_KEY &&
wc->stage == LOG_WALK_REPLAY_INODES) {
struct btrfs_inode_item *inode_item;
u32 mode;
inode_item = btrfs_item_ptr(eb, i,
struct btrfs_inode_item);
/*
* If we have a tmpfile (O_TMPFILE) that got fsync'ed
* and never got linked before the fsync, skip it, as
* replaying it is pointless since it would be deleted
* later. We skip logging tmpfiles, but it's always
* possible we are replaying a log created with a kernel
* that used to log tmpfiles.
*/
if (btrfs_inode_nlink(eb, inode_item) == 0) {
wc->ignore_cur_inode = true;
continue;
} else {
wc->ignore_cur_inode = false;
}
/* Deletions are replayed before the inode item is copied in. */
ret = replay_xattr_deletes(wc->trans, root, log,
path, key.objectid);
if (ret)
break;
mode = btrfs_inode_mode(eb, inode_item);
if (S_ISDIR(mode)) {
ret = replay_dir_deletes(wc->trans,
root, log, path, key.objectid, 0);
if (ret)
break;
}
ret = overwrite_item(wc->trans, root, path,
eb, i, &key);
if (ret)
break;
/*
* Before replaying extents, truncate the inode to its
* size. We need to do it now and not after log replay
* because before an fsync we can have prealloc extents
* added beyond the inode's i_size. If we did it after,
* through orphan cleanup for example, we would drop
* those prealloc extents just after replaying them.
*/
if (S_ISREG(mode)) {
struct btrfs_drop_extents_args drop_args = { 0 };
struct inode *inode;
u64 from;
inode = read_one_inode(root, key.objectid);
if (!inode) {
ret = -EIO;
break;
}
from = ALIGN(i_size_read(inode),
root->fs_info->sectorsize);
drop_args.start = from;
drop_args.end = (u64)-1;
drop_args.drop_cache = true;
ret = btrfs_drop_extents(wc->trans, root,
BTRFS_I(inode),
&drop_args);
if (!ret) {
inode_sub_bytes(inode,
drop_args.bytes_found);
/* Update the inode's nbytes. */
ret = btrfs_update_inode(wc->trans,
root, BTRFS_I(inode));
}
iput(inode);
if (ret)
break;
}
/* Have the inode's link count verified once replay is done. */
ret = link_to_fixup_dir(wc->trans, root,
path, key.objectid);
if (ret)
break;
}
/* Skip all the items of an inode that was ignored above. */
if (wc->ignore_cur_inode)
continue;
if (key.type == BTRFS_DIR_INDEX_KEY &&
wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
ret = replay_one_dir_item(wc->trans, root, path,
eb, i, &key);
if (ret)
break;
}
/* Everything below is only done in the final stage. */
if (wc->stage < LOG_WALK_REPLAY_ALL)
continue;
/* these keys are simply copied */
if (key.type == BTRFS_XATTR_ITEM_KEY) {
ret = overwrite_item(wc->trans, root, path,
eb, i, &key);
if (ret)
break;
} else if (key.type == BTRFS_INODE_REF_KEY ||
key.type == BTRFS_INODE_EXTREF_KEY) {
ret = add_inode_ref(wc->trans, root, log, path,
eb, i, &key);
/* -ENOENT (a parent directory is missing) is tolerated here. */
if (ret && ret != -ENOENT)
break;
ret = 0;
} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
ret = replay_one_extent(wc->trans, root, path,
eb, i, &key);
if (ret)
break;
}
/*
* We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the
* BTRFS_DIR_INDEX_KEY items which we use to derive the
* BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an
* older kernel with such keys, ignore them.
*/
}
btrfs_free_path(path);
return ret;
}
/*
 * Give back the reserved bytes accounted for the log tree extent buffer that
 * starts at @start: one nodesize is subtracted from the reserved counters of
 * both the block group containing @start and its space info.
 *
 * An error is logged, and nothing else is done, when no block group is found.
 */
static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
{
	struct btrfs_block_group *bg = btrfs_lookup_block_group(fs_info, start);

	if (!bg) {
		btrfs_err(fs_info, "unable to find block group for %llu", start);
		return;
	}

	/* Lock order: space info first, then the block group. */
	spin_lock(&bg->space_info->lock);
	spin_lock(&bg->lock);

	bg->reserved -= fs_info->nodesize;
	bg->space_info->bytes_reserved -= fs_info->nodesize;

	spin_unlock(&bg->lock);
	spin_unlock(&bg->space_info->lock);

	btrfs_put_block_group(bg);
}
/*
* Walk down the log tree @root starting at the node at @path->nodes[*level]
* and the slot at @path->slots[*level].
*
* For nodes above level 1 the child block is read, stored in @path and
* descended into, updating *@level. At level 1 every remaining child (a level 0
* block) is passed to wc->process_func() and, if wc->free is set, cleaned up:
* with a transaction (@trans) its extent is pinned, without one its dirty bit
* is cleared and its reserved bytes are unaccounted.
*
* On return the slot of the level reached is set to the number of items of its
* node. Returns 0 on success, otherwise a negative errno or the non-zero value
* returned by wc->process_func().
*/
static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, int *level,
struct walk_control *wc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 bytenr;
u64 ptr_gen;
struct extent_buffer *next;
struct extent_buffer *cur;
u32 blocksize;
int ret = 0;
while (*level > 0) {
struct btrfs_tree_parent_check check = { 0 };
cur = path->nodes[*level];
WARN_ON(btrfs_header_level(cur) != *level);
/* All the children of this node were visited already. */
if (path->slots[*level] >=
btrfs_header_nritems(cur))
break;
bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
/* What the child block must look like when it gets read. */
check.transid = ptr_gen;
check.level = *level - 1;
check.has_first_key = true;
btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]);
blocksize = fs_info->nodesize;
next = btrfs_find_create_tree_block(fs_info, bytenr,
btrfs_header_owner(cur),
*level - 1);
if (IS_ERR(next))
return PTR_ERR(next);
/* The child is a level 0 block: process it here instead of descending. */
if (*level == 1) {
ret = wc->process_func(root, next, wc, ptr_gen,
*level - 1);
if (ret) {
free_extent_buffer(next);
return ret;
}
path->slots[*level]++;
if (wc->free) {
ret = btrfs_read_extent_buffer(next, &check);
if (ret) {
free_extent_buffer(next);
return ret;
}
if (trans) {
btrfs_tree_lock(next);
btrfs_clean_tree_block(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
ret = btrfs_pin_reserved_extent(trans,
bytenr, blocksize);
if (ret) {
free_extent_buffer(next);
return ret;
}
btrfs_redirty_list_add(
trans->transaction, next);
} else {
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
clear_extent_buffer_dirty(next);
unaccount_log_buffer(fs_info, bytenr);
}
}
free_extent_buffer(next);
continue;
}
ret = btrfs_read_extent_buffer(next, &check);
if (ret) {
free_extent_buffer(next);
return ret;
}
/* Replace whatever block the path held for the level below. */
if (path->nodes[*level-1])
free_extent_buffer(path->nodes[*level-1]);
path->nodes[*level-1] = next;
*level = btrfs_header_level(next);
path->slots[*level] = 0;
cond_resched();
}
/* Mark the node at the level we stopped at as fully consumed. */
path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
cond_resched();
return 0;
}
static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, int *level,
struct walk_control *wc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int i;
int slot;
int ret;
for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
slot = path->slots[i];
if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
path->slots[i]++;
*level = i;
WARN_ON(*level == 0);
return 0;
} else {
ret = wc->process_func(root, path->nodes[*level], wc,
btrfs_header_generation(path->nodes[*level]),
*level);
if (ret)
return ret;
if (wc->free) {
struct extent_buffer *next;
next = path->nodes[*level];
if (trans) {
btrfs_tree_lock(next);
btrfs_clean_tree_block(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
ret = btrfs_pin_reserved_extent(trans,
path->nodes[*level]->start,
path->nodes[*level]->len);
if (ret)
return ret;
btrfs_redirty_list_add(trans->transaction,
next);
} else {
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
clear_extent_buffer_dirty(next);
unaccount_log_buffer(fs_info,
path->nodes[*level]->start);
}
}
free_extent_buffer(path->nodes[*level]);
path->nodes[*level] = NULL;
*level = i + 1;
}
}
return 1;
}
/*
 * Walk an entire log tree, starting from log->node.
 *
 * The tree is traversed by alternating walk_down_log_tree() and
 * walk_up_log_tree() until one of them returns a positive value. Each block
 * visited is passed to wc->process_func. If the root node is still in the
 * path once the traversal ends, it is processed here.
 *
 * When wc->free is set, processed blocks are also released: with a
 * transaction handle they are cleaned, pinned and added to the transaction's
 * redirty list; without one their dirty bit is cleared and their reservation
 * is dropped through unaccount_log_buffer().
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, or the first
 * non-zero value returned by the walk helpers, wc->process_func or the pin
 * operation.
 */
static int walk_log_tree(struct btrfs_trans_handle *trans,
			 struct btrfs_root *log, struct walk_control *wc)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	struct btrfs_path *path;
	struct extent_buffer *root_eb;
	int orig_level;
	int level;
	int wret;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	orig_level = btrfs_header_level(log->node);
	level = orig_level;

	/* The path holds its own reference on the root node. */
	path->nodes[level] = log->node;
	atomic_inc(&log->node->refs);
	path->slots[level] = 0;

	for (;;) {
		wret = walk_down_log_tree(trans, log, path, &level, wc);
		if (wret == 0)
			wret = walk_up_log_tree(trans, log, path, &level, wc);

		if (wret > 0)
			break;
		if (wret < 0) {
			ret = wret;
			goto out;
		}
	}

	/* Was the root node processed? If not, catch it here. */
	root_eb = path->nodes[orig_level];
	if (!root_eb)
		goto out;

	ret = wc->process_func(log, root_eb, wc,
			       btrfs_header_generation(root_eb), orig_level);
	if (ret || !wc->free)
		goto out;

	if (trans) {
		btrfs_tree_lock(root_eb);
		btrfs_clean_tree_block(root_eb);
		btrfs_wait_tree_block_writeback(root_eb);
		btrfs_tree_unlock(root_eb);

		ret = btrfs_pin_reserved_extent(trans, root_eb->start,
						root_eb->len);
		if (!ret)
			btrfs_redirty_list_add(trans->transaction, root_eb);
	} else {
		if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &root_eb->bflags))
			clear_extent_buffer_dirty(root_eb);
		unaccount_log_buffer(fs_info, root_eb->start);
	}

out:
	btrfs_free_path(path);
	return ret;
}
/*
 * Write @root_item for a subvolume's log root into the tree of log roots
 * (fs_info->log_root_tree), keyed by log->root_key.
 *
 * The item is inserted when log->log_transid is 1 (the first sync of this
 * log) and updated in place otherwise.
 *
 * Returns the result of the insert or update call.
 */
static int update_log_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *log,
			   struct btrfs_root_item *root_item)
{
	struct btrfs_root *log_root_tree = log->fs_info->log_root_tree;

	/* Insert the root item on the first sync. */
	if (log->log_transid == 1)
		return btrfs_insert_root(trans, log_root_tree,
					 &log->root_key, root_item);

	return btrfs_update_root(trans, log_root_tree,
				 &log->root_key, root_item);
}
/*
 * Sleep (uninterruptibly) until the log commit for @transid is no longer in
 * progress on @root.
 *
 * The wait ends as soon as root->log_transid_committed has reached @transid
 * or the log_commit flag for this transid's slot is clear. root->log_mutex
 * is released around each schedule() and re-acquired afterwards, so the
 * caller is expected to hold it on entry; it is held again on return.
 */
static void wait_log_commit(struct btrfs_root *root, int transid)
{
	const int slot = transid % 2;
	DEFINE_WAIT(wait);

	/*
	 * Only two log transactions may be pending at a time, so if ours is
	 * more than 2 older than the current transaction, we are done.
	 */
	while (1) {
		prepare_to_wait(&root->log_commit_wait[slot], &wait,
				TASK_UNINTERRUPTIBLE);

		if (root->log_transid_committed >= transid ||
		    !atomic_read(&root->log_commit[slot]))
			break;

		mutex_unlock(&root->log_mutex);
		schedule();
		mutex_lock(&root->log_mutex);
	}

	finish_wait(&root->log_commit_wait[slot], &wait);
}
/*
 * Sleep (uninterruptibly) until root->log_writers drops to zero.
 *
 * root->log_mutex is released around each schedule() and re-acquired
 * afterwards, so the caller is expected to hold it on entry; it is held
 * again on return.
 */
static void wait_for_writer(struct btrfs_root *root)
{
	DEFINE_WAIT(wait);

	while (1) {
		prepare_to_wait(&root->log_writer_wait, &wait,
				TASK_UNINTERRUPTIBLE);

		/* No writers left: stop waiting. */
		if (atomic_read(&root->log_writers) == 0)
			break;

		mutex_unlock(&root->log_mutex);
		schedule();
		mutex_lock(&root->log_mutex);
	}

	finish_wait(&root->log_writer_wait, &wait);
}
/*
 * Unlink @ctx from whichever log context list it is on.
 *
 * The removal is done under root->log_mutex, which this helper takes and
 * releases itself. The list node is re-initialised, so it reads as empty
 * afterwards.
 */
static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
					struct btrfs_log_ctx *ctx)
{
	struct mutex *lock = &root->log_mutex;

	mutex_lock(lock);
	list_del_init(&ctx->list);
	mutex_unlock(lock);
}
/*
 * Empty the log context list root->log_ctxs[@index], storing @error in the
 * log_ret field of every context removed.
 *
 * Must be invoked with the log mutex held, or when it is certain that no
 * other task can access the list.
 */
static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
					     int index, int error)
{
	struct list_head *head = &root->log_ctxs[index];
	struct btrfs_log_ctx *cur;
	struct btrfs_log_ctx *tmp;

	/* The _safe iterator is needed because entries are unlinked as we go. */
	list_for_each_entry_safe(cur, tmp, head, list) {
		list_del_init(&cur->list);
		cur->log_ret = error;
	}
}
/*
* btrfs_sync_log sends a given tree log down to the disk and
* updates the super blocks to record it. When this call is done,
* you know that any inodes previously logged are safely on disk only
* if it returns 0.
*
* Any other return value means you need to call btrfs_commit_transaction.
* Some of the edge cases for fsyncing directories that have had unlinks
* or renames done in the past mean that sometimes the only safe
* fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
* that has happened.
*/
int btrfs_sync_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_log_ctx *ctx)
{
int index1;
int index2;
int mark;
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *log = root->log_root;
struct btrfs_root *log_root_tree = fs_info->log_root_tree;
struct btrfs_root_item new_root_item;
int log_transid = 0;
struct btrfs_log_ctx root_log_ctx;
struct blk_plug plug;
u64 log_root_start;
u64 log_root_level;
mutex_lock(&root->log_mutex);
log_transid = ctx->log_transid;
if (root->log_transid_committed >= log_transid) {
mutex_unlock(&root->log_mutex);
return ctx->log_ret;
}
index1 = log_transid % 2;
if (atomic_read(&root->log_commit[index1])) {
wait_log_commit(root, log_transid);
mutex_unlock(&root->log_mutex);
return ctx->log_ret;
}
ASSERT(log_transid == root->log_transid);
atomic_set(&root->log_commit[index1], 1);
/* wait for previous tree log sync to complete */
if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
wait_log_commit(root, log_transid - 1);
while (1) {
int batch = atomic_read(&root->log_batch);
/* when we're on an ssd, just kick the log commit out */
if (!btrfs_test_opt(fs_info, SSD) &&
test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
mutex_unlock(&root->log_mutex);
schedule_timeout_uninterruptible(1);
mutex_lock(&root->log_mutex);
}
wait_for_writer(root);
if (batch == atomic_read(&root->log_batch))
break;
}
/* bail out if we need to do a full commit */
if (btrfs_need_log_full_commit(trans)) {
ret = BTRFS_LOG_FORCE_COMMIT;
mutex_unlock(&root->log_mutex);
goto out;
}
if (log_transid % 2 == 0)
mark = EXTENT_DIRTY;
else
mark = EXTENT_NEW;
/* we start IO on all the marked extents here, but we don't actually
* wait for them until later.
*/
blk_start_plug(&plug);
ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
/*
* -EAGAIN happens when someone, e.g., a concurrent transaction
* commit, writes a dirty extent in this tree-log commit. This
* concurrent write will create a hole writing out the extents,
* and we cannot proceed on a zoned filesystem, requiring
* sequential writing. We could bail out to a full commit here,
* but instead we continue, hoping the concurrent write fills the
* hole.
*/
if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
ret = 0;
if (ret) {
blk_finish_plug(&plug);
btrfs_abort_transaction(trans, ret);
btrfs_set_log_full_commit(trans);
mutex_unlock(&root->log_mutex);
goto out;
}
/*
* We _must_ update under the root->log_mutex in order to make sure we
* have a consistent view of the log root we are trying to commit at
* this moment.
*
* We _must_ copy this into a local copy, because we are not holding the
* log_root_tree->log_mutex yet. This is important because when we
* commit the log_root_tree we must have a consistent view of the
* log_root_tree when we update the super block to point at the
* log_root_tree bytenr. If we update the log_root_tree here we'll race
* with the commit and possibly point at the new block which we may not
* have written out.
*/
btrfs_set_root_node(&log->root_item, log->node);
memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
root->log_transid++;
log->log_transid = root->log_transid;
root->log_start_pid = 0;
/*
* IO has been started, blocks of the log tree have WRITTEN flag set
* in their headers. new modifications of the log will be written to
* new positions. so it's safe to allow log writers to go in.
*/
mutex_unlock(&root->log_mutex);
if (btrfs_is_zoned(fs_info)) {
mutex_lock(&fs_info->tree_root->log_mutex);
if (!log_root_tree->node) {
ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
if (ret) {
mutex_unlock(&fs_info->tree_root->log_mutex);
blk_finish_plug(&plug);
goto out;
}
}
mutex_unlock(&fs_info->tree_root->log_mutex);
}
btrfs_init_log_ctx(&root_log_ctx, NULL);
mutex_lock(&log_root_tree->log_mutex);
index2 = log_root_tree->log_transid % 2;
list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
root_log_ctx.log_transid = log_root_tree->log_transid;
/*
* Now we are safe to update the log_root_tree because we're under the
* log_mutex, and we're a current writer so we're holding the commit
* open until we drop the log_mutex.
*/
ret = update_log_root(trans, log, &new_root_item);
if (ret) {
if (!list_empty(&root_log_ctx.list))
list_del_init(&root_log_ctx.list);
blk_finish_plug(&plug);
btrfs_set_log_full_commit(trans);
if (ret != -ENOSPC) {
btrfs_abort_transaction(trans, ret);
mutex_unlock(&log_root_tree->log_mutex);
goto out;
}
btrfs_wait_tree_log_extents(log, mark);
mutex_unlock(&log_root_tree->log_mutex);
ret = BTRFS_LOG_FORCE_COMMIT;
goto out;
}
if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
blk_finish_plug(&plug);
list_del_init(&root_log_ctx.list);
mutex_unlock(&log_root_tree->log_mutex);
ret = root_log_ctx.log_ret;
goto out;
}
index2 = root_log_ctx.log_transid % 2;
if (atomic_read(&log_root_tree->log_commit[index2])) {
blk_finish_plug(&plug);
ret = btrfs_wait_tree_log_extents(log, mark);
wait_log_commit(log_root_tree,
root_log_ctx.log_transid);
mutex_unlock(&log_root_tree->log_mutex);
if (!ret)
ret = root_log_ctx.log_ret;
goto out;
}
ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
atomic_set(&log_root_tree->log_commit[index2], 1);
if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
wait_log_commit(log_root_tree,
root_log_ctx.log_transid - 1);
}
/*
* now that we've moved on to the tree of log tree roots,
* check the full commit flag again
*/
if (btrfs_need_log_full_commit(trans)) {
blk_finish_plug(&plug);
btrfs_wait_tree_log_extents(log, mark);
mutex_unlock(&log_root_tree->log_mutex);
ret = BTRFS_LOG_FORCE_COMMIT;
goto out_wake_log_root;
}
ret = btrfs_write_marked_extents(fs_info,
&log_root_tree->dirty_log_pages,
EXTENT_DIRTY | EXTENT_NEW);
blk_finish_plug(&plug);
/*
* As described above, -EAGAIN indicates a hole in the extents. We
* cannot wait for these write outs since waiting would cause a
* deadlock. Bail out to the full commit instead.
*/
if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
btrfs_set_log_full_commit(trans);
btrfs_wait_tree_log_extents(log, mark);
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
} else if (ret) {
btrfs_set_log_full_commit(trans);
btrfs_abort_transaction(trans, ret);
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
}
ret = btrfs_wait_tree_log_extents(log, mark);
if (!ret)
ret = btrfs_wait_tree_log_extents(log_root_tree,
EXTENT_NEW | EXTENT_DIRTY);
if (ret) {
btrfs_set_log_full_commit(trans);
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
}