/* fs/union.c - linux/kernel/git/dhowells/linux-fs - Git at Google */

 /* VFS-based union mounts for Linux
  *
  * Copyright (C) 2004-2007 IBM Corporation, IBM Deutschland Entwicklung GmbH.
  * Copyright (C) 2007-2009 Novell Inc.
  * Copyright (C) 2009-2012 Red Hat, Inc.
  *
  *   Author(s): Jan Blunck (j.blunck@tu-harburg.de)
  *              Valerie Aurora <vaurora@redhat.com>
  *              David Howells <dhowells@redhat.com>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
  * as published by the Free Software Foundation; version 2
  * of the License.
  */
 #define DEBUG
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/mount.h>
 #include <linux/fs_struct.h>
 #include <linux/slab.h>
 #include <linux/fsnotify.h>
 #include <linux/xattr.h>
 #include <linux/file.h>
 #include <linux/security.h>
 #include <linux/splice.h>

 #include "internal.h"
 #include "union.h"

 /**
  * union_alloc - allocate a union stack
  * @topmost: path of topmost directory
  *
  * Allocate a zeroed union_stack with one struct path slot per layer.  The
  * layer count is taken from s_union_count of the topmost dentry's superblock.
  *
  * Returns the new stack, or NULL if the allocation failed.
  */
 struct union_stack *union_alloc(struct path *topmost)
 {
 	unsigned int layers = topmost->dentry->d_sb->s_union_count;

 	/* kcalloc() takes the element count first and the element size second */
 	return kcalloc(layers, sizeof(struct path), GFP_KERNEL);
 }

 /**
  * d_free_unions - release whatever a dentry's union stack pointer is pinning
  * @topmost: topmost dentry in the union stack to remove
  *
  * To be called when a dentry is being freed.  d_inode is not consulted here
  * as it may already be defunct or cleared.
  *
  * Depending on DCACHE_UNION_PINNING_LOWER, the dentry either pins a single
  * lower dentry (d_fallthru) or owns an allocated stack of lower directory
  * paths; the matching one is released and d_union_stack is reset to NULL.
  */
 void d_free_unions(struct dentry *topmost)
 {
 	unsigned int nr_layers = topmost->d_sb->s_union_count;
 	unsigned int idx;

 	/* Nothing attached, nothing to release */
 	if (!topmost->d_union_stack)
 		return;

 	if (topmost->d_flags & DCACHE_UNION_PINNING_LOWER) {
 		/* Negative non-dir upper dentry: drop the one lower dentry
 		 * it was holding on behalf of f_inode.
 		 */
 		printk("free pin: %pq\n", &topmost->d_name);
 		dput(topmost->d_fallthru);
 	} else {
 		/* Positive directory dentry: drop every populated slot in
 		 * the stack of lower dirs, then the stack itself.
 		 */
 		printk("free dirstack: %pq\n", &topmost->d_name);

 		for (idx = 0; idx < nr_layers; idx++) {
 			struct path *slot = union_find_dir(topmost, idx);

 			if (slot->mnt)
 				path_put(slot);
 		}
 		kfree(topmost->d_union_stack);
 	}

 	topmost->d_union_stack = NULL;
 }

 /**
  * union_add_dir - Record a lower directory in a unioned directory's stack
  * @topmost: topmost directory
  * @lower: directory in the current layer
  * @layer: index of layer to add this at
  *
  * @layer is zero-based, with 0 being the directory immediately below the
  * topmost one.  The union stack is allocated on first use.
  *
  * The caller's references on the mount and dentry in *lower are handed over
  * to the union stack.
  *
  * Returns 0 on success or -ENOMEM if the stack could not be allocated.
  */
 int union_add_dir(struct path *topmost, struct path *lower, unsigned layer)
 {
 	struct dentry *top = topmost->dentry;
 	struct path *slot;

 	BUG_ON(layer >= top->d_sb->s_union_count);
 	BUG_ON(d_is_fallthru(top));

 	if (!top->d_union_stack) {
 		top->d_union_stack = union_alloc(topmost);
 		if (!top->d_union_stack)
 			return -ENOMEM;
 	}

 	slot = union_find_dir(top, layer);
 	*slot = *lower;
 	return 0;
 }

 /**
  * union_copyup_xattr - Copy extended attributes to a copied-up object
  * @new: path of new copy
  * @old: dentry of original file
  *
  * Copy up extended attributes from the original file to the new one.  Every
  * name returned by vfs_listxattr() on @old is read with vfs_getxattr() and
  * written to @new with vfs_setxattr().
  *
  * Returns 0 on success (including when either side lacks xattr support or
  * @old has no attributes) or a negative error code from the first failing
  * list/get/set operation or from memory allocation.
  *
  * XXX - Permissions?  For now, copying up every xattr.
  */
 static int union_copyup_xattr(struct path *new, struct dentry *old)
 {
 	ssize_t list_size, size;
 	char *buf, *name, *value;
 	int error;

 	/* Check for xattr support */
 	if (!old->d_inode->i_op->getxattr ||
 	    !new->dentry->d_inode->i_op->getxattr)
 		return 0;

 	/* Find out how big the list of xattrs is */
 	list_size = vfs_listxattr(old, NULL, 0);
 	if (list_size <= 0)
 		return list_size;

 	/* Allocate memory for the list */
 	buf = kzalloc(list_size, GFP_KERNEL);
 	if (!buf)
 		return -ENOMEM;

 	/* Allocate memory for the xattr's value */
 	error = -ENOMEM;
 	value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL);
 	if (!value)
 		goto out;

 	/* Actually get the list of xattrs */
 	list_size = vfs_listxattr(old, buf, list_size);
 	if (list_size <= 0) {
 		error = list_size;
 		goto out_free_value;
 	}

 	/* The list is a sequence of NUL-terminated names packed back to back */
 	for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
 		/* XXX Locking? old is on read-only fs */
 		size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX);
 		/* Only a negative size is an error.  An attribute with an
 		 * empty value is legitimate and must be copied like any
 		 * other; bailing out on size == 0 would silently skip all
 		 * the attributes that follow it.
 		 */
 		if (size < 0) {
 			error = size;
 			goto out_free_value;
 		}
 		/* XXX do we really need to check for size overflow? */
 		/* XXX locks new dentry, lock ordering problems? */
 		error = vfs_setxattr(new, name, value, size, 0);
 		if (error)
 			goto out_free_value;
 	}

 out_free_value:
 	kfree(value);
 out:
 	kfree(buf);
 	return error;
 }

 /**
  * union_create_topmost_dir - Create a matching dir in the topmost file system
  * @parent - parent of target on topmost layer
  * @topmost - path of target on topmost layer
  * @lower - path of source on lower layer
  *
  * As we lookup each directory on the lower layer of a union, we create a
  * matching directory on the topmost layer if it does not already exist.
  *
  * We don't use vfs_mkdir() for a few reasons: don't want to do the security
  * check, don't want to make the dir opaque, don't need to sanitize the mode.
  *
  * The caller must hold the parent i_mutex lock and the mnt_want_write lock.
  *
  * XXX - owner is wrong, set credentials properly
  * XXX - rmdir() directory on failure of xattr copyup
  * XXX - not atomic w/ respect to crash
  */
 int union_create_topmost_dir(struct path *parent,
 			     struct path *topmost, struct dentry *lower)
 {
 	struct inode *dir = parent->dentry->d_inode;
 	int mode = lower->d_inode->i_mode;
 	int error;

 	BUG_ON(topmost->dentry->d_inode);

 	/* XXX - Do we even need to check this? */
 	if (!dir->i_op->mkdir)
 		return -EPERM;

 	error = dir->i_op->mkdir(dir, topmost->dentry, mode);
 	if (error)
 		return error;

 	error = union_copyup_xattr(topmost, lower);
 	if (error)
 		goto out_rmdir;

 	fsnotify_mkdir(dir, topmost->dentry);
 	return 0;

 out_rmdir:
 	/* XXX rm created dir */
 	dput(topmost->dentry);
 	return error;
 }

 /*
  * State passed to the directory iterator while copying up a directory.  The
  * per-entry callback receives a pointer to this structure as its buffer
  * argument.
  */
 struct union_iterate_context {
 	struct dir_context ctx;		/* iterator context; ctx.actor is the per-entry callback */
 	struct dentry *topmost_dentry;	/* topmost directory that entries are looked up in */
 	int error;			/* error recorded by the per-entry callback, 0 if none */
 };

 /**
  * union_copyup_one_dirent - copy up a single directory entry
  *
  * Per-entry callback used by the directory copyup.  @buf is the
  * union_iterate_context set up by the caller; @name/@namelen give the entry
  * found in a lower directory.  Entries from higher layers are seen first.
  *
  * Returns 0 if the entry was skipped or a fallthru was created (or already
  * existed), otherwise a negative error which is also stored in the context.
  */
 static int union_copyup_one_dirent(void *buf, const char *name, int namelen,
 				   loff_t offset, u64 ino, unsigned int d_type)
 {
 	struct union_iterate_context *uic = buf;
 	struct dentry *top_dir = uic->topmost_dentry;
 	struct dentry *entry;
 	int err = 0;

 	/* "." and ".." are never copied up */
 	if (namelen == 1 && name[0] == '.')
 		return 0;
 	if (namelen == 2 && name[0] == '.' && name[1] == '.')
 		return 0;

 	/* Lookup this entry in the topmost directory */
 	entry = lookup_one_len(name, top_dir, namelen);
 	if (IS_ERR(entry)) {
 		printk(KERN_WARNING "%s: error looking up %*.*s\n",
 		       __func__, namelen, namelen, name);
 		err = PTR_ERR(entry);
 		goto out;
 	}

 	/* Dentries with a ->d_revalidate() op always get a fallthru
 	 * (XXX do we need to revalidate on readdir anyway? think NFS).
 	 *
 	 * Otherwise, an entry that is positive, a whiteout or already a
 	 * fallthru masks anything in the lower file systems - whether it was
 	 * copied up by an earlier lookup or simply exists on the topmost file
 	 * system - so there is nothing to copy up.
 	 */
 	if (!(entry->d_op && entry->d_op->d_revalidate) &&
 	    (entry->d_inode || d_is_whiteout(entry) || d_is_fallthru(entry)))
 		goto out_dput;

 	/* No masking entry: ask the topmost file system to store a fallthru.
 	 * All possible directory types are used, so each file system must
 	 * implement its own way of storing one.
 	 */
 	err = top_dir->d_inode->i_op->fallthru(top_dir->d_inode, entry);

 	/* It's okay if it exists, ultimate responsibility rests with
 	 * ->fallthru() */
 	if (err == -EEXIST)
 		err = 0;
 out_dput:
 	dput(entry);
 out:
 	if (err)
 		uic->error = err;
 	return err;
 }

 /**
  * __union_copyup_one_dir - Non-recursive directory copy up
  *
  * Copy up the specified directory only, without recursing into the subtree
  * rooted at this point.
  *
  * During the operation, where a directory entry exists in one of the lower
  * directories, a fallthrough dentry will be created in the upper directory if
  * the upper directory doesn't already have an entry that obscures it.  At the
  * end of the operation, the upper directory will be marked opaque on the
  * medium - thus preventing further copy up attempts on this directory.
  *
  * TODO: At some point in the future, on-medium whiteouts should be culled from
  * a directory that is marked opaque as they then serve no purpose.
  *
  * The primary reason for this function is that readdir() is difficult to
  * support on union file systems for two reasons: We must eliminate duplicates
  * and apply whiteouts, and we must return something in f_pos that lets us
  * restart in the same place when we return.  Our solution is to, on first
  * readdir() of the directory, copy up all visible entries from the low-level
  * file systems and mark the entries that refer to low-level file system
  * objects as "fallthrough" entries.
  *
  * Sadly, this function is also necessary for rmdir().  To work out whether a
  * directory is empty, we have to work out if there are entries in lower
  * directories that are not obscured by whiteouts in the upper.  This is not a
  * trivial operation.  The simplest way is, therefore, to copy up and then
  * check the combined opaque directory.
  *
  *
  * Locking strategy: We hold the topmost dir's i_mutex on entry.  We grab the
  * i_mutex on lower directories one by one.  So the locking order is:
  *
  *	Writable/topmost layers > Read-only/lower layers
  *
  * So there is no problem with lock ordering for union stacks with
  * multiple lower layers.  E.g.:
  *
  *	(topmost) A->B->C (bottom)
  *	(topmost) D->C->B (bottom)
  *
  */
 int __union_copyup_one_dir(struct path *topmost_path)
 {
 	struct dentry *topmost_dentry = topmost_path->dentry;
 	unsigned int i, layers = topmost_dentry->d_sb->s_union_count;
 	int error = 0;

 	/* Iterator context: each lower entry is fed to
 	 * union_copyup_one_dirent(), which looks it up in topmost_dentry.
 	 */
 	struct union_iterate_context uic = {
 		.ctx.actor = union_copyup_one_dirent,
 		.topmost_dentry = topmost_dentry,
 	};


 	/* An opaque topmost dir has already been copied up; nothing to do */
 	if (IS_OPAQUE(topmost_dentry->d_inode))
 		return 0;

 	/* The topmost file system must be able to store fallthru entries */
 	if (!topmost_dentry->d_inode->i_op ||
 	    !topmost_dentry->d_inode->i_op->fallthru)
 		return -EOPNOTSUPP;

 	/* Walk the lower layers from the one just below the top downwards */
 	for (i = 0; i < layers; i++) {
 		struct inode *inode;
 		struct file *ftmp;
 		struct path *path;

 		/* Skip layers that have no directory recorded at this slot */
 		path = union_find_dir(topmost_dentry, i);
 		if (!path->mnt)
 			continue;

 		ftmp = dentry_open(path, O_RDONLY | O_DIRECTORY | O_NOATIME,
 				   current_cred());
 		if (IS_ERR(ftmp)) {
 			printk(KERN_ERR "unable to open dir %pq for "
 			       "directory copyup: %ld\n",
 			       &path->dentry->d_name, PTR_ERR(ftmp));
 			error = PTR_ERR(ftmp);
 			break;
 		}

 		/* Hold the lower directory's i_mutex across the iteration */
 		inode = file_inode(ftmp);
 		mutex_lock(&inode->i_mutex);

 		/* A dead (removed) lower directory yields -ENOENT */
 		error = -ENOENT;
 		if (IS_DEADDIR(inode))
 			goto out_fput;

 		/* Read the whole directory, calling our directory entry copyup
 		 * function on each entry.
 		 */
 		uic.ctx.pos = 0;
 		uic.error = 0;
 		error = ftmp->f_op->iterate(ftmp, &uic.ctx);
 out_fput:
 		mutex_unlock(&inode->i_mutex);
 		fput(ftmp);

 		/* An error recorded by the per-entry callback takes
 		 * precedence over the iterator's own return value.
 		 */
 		if (uic.error)
 			error = uic.error;
 		if (error)
 			break;

 		/* XXX Should process directories below an opaque directory in
 		 * case there are fallthrus in it
 		 */
 		if (IS_OPAQUE(path->dentry->d_inode))
 			break;
 	}

 	/* Mark this dir opaque to show that we have already copied up the
 	 * lower entries.  Be sure to do this AFTER the directory entries have
 	 * been copied up so that if we crash in the middle of copyup, we will
 	 * try to copyup the dir next time we read it.
 	 *
 	 * XXX - Could leave directory non-opaque, and force reread/copyup of
 	 * directory each time it is read in from disk.  That would make it
 	 * easy to update lower file systems (when not union mounted) and have
 	 * the changes show up when union mounted again.
 	 */
 	if (!error) {
 		topmost_dentry->d_inode->i_flags |= S_OPAQUE;
 		mark_inode_dirty(topmost_dentry->d_inode);
 	}

 	return error;
 }

 /* Derive the DT_xxx directory entry type from the file type bits of i_mode */
 static inline unsigned char dt_type(struct inode *inode)
 {
 	unsigned int type_bits = inode->i_mode >> 12;

 	return type_bits & 0xf;
 }

 /**
  * generic_readdir_fallthru - Helper to lookup target of a fallthru
  * @topmost_dentry: dentry for the topmost dentry of the dir being read
  * @name: name of fallthru dirent
  * @namlen: length of @name
  * @ino: return inode number of target, if found
  * @d_type: return directory type of target, if found
  *
  * In readdir(), client file systems need to lookup the target of a
  * fallthru in a lower layer for three reasons: (1) fill in d_ino, (2)
  * fill in d_type, (3) make sure there is something to fall through to
  * (and if not, don't return this dentry).  Upon detecting a fallthru
  * dentry in readdir(), the client file system should call this function.
  *
  * The caller must hold the i_mutex of the topmost directory.  Each populated
  * lower directory is searched in turn, from the layer just below the top
  * downwards, and the first positive dentry found supplies *@ino and *@d_type.
  *
  * Returns 0 on success and -ENOENT if no matching directory entry was
  * found (which can happen when the topmost file system is unmounted
  * and remounted over a different file system).  Any other errors
  * are unexpected.
  */
 int generic_readdir_fallthru(struct dentry *topmost_dentry, const char *name,
 			     int namlen, ino_t *ino, unsigned char *d_type)
 {
 	struct path *parent;
 	struct dentry *dentry;
 	unsigned int i, layers = topmost_dentry->d_sb->s_union_count;

 	BUG_ON(!mutex_is_locked(&topmost_dentry->d_inode->i_mutex));

 	for (i = 0; i < layers; i++) {
 		parent = union_find_dir(topmost_dentry, i);

 		/* Not every layer necessarily has a directory at this point
 		 * in the tree; such slots have no mnt/dentry and must be
 		 * skipped, as the other walkers of the stack do.
 		 */
 		if (!parent->mnt)
 			continue;

 		mutex_lock(&parent->dentry->d_inode->i_mutex);
 		dentry = lookup_one_len(name, parent->dentry, namlen);
 		mutex_unlock(&parent->dentry->d_inode->i_mutex);
 		if (IS_ERR(dentry))
 			return PTR_ERR(dentry);
 		if (dentry->d_inode) {
 			*ino = dentry->d_inode->i_ino;
 			*d_type = dt_type(dentry->d_inode);
 			dput(dentry);
 			return 0;
 		}
 		dput(dentry);
 	}
 	return -ENOENT;
 }
 EXPORT_SYMBOL(generic_readdir_fallthru);

 /*
  * Get the inode and path for a dentry where that inode may exist on a lower
  * layer in a union.
  *
  * The caller must preclear the elements of *_lower_cache and prime *_actual
  * with the contents of *upper (as is done by wrappers in union.h) and must
  * also hold parent->i_mutex.
  *
  * Note that we don't get a ref on the inode or the lower vfsmount (if
  * returned).  We leave it to the caller to iget/mntget them if appropriate.
  * This should be safe as the caller holds parent->i_mutex.  The lower dentry
  * (if returned) is dget'd, however.
  *
  * The pointers returned in *_actual are not dget'd/mntget'd as it is assumed
  * they're pinned by the caller's ref on upper->mnt (if set), upper->dentry; or
  * by the fact that parent->i_mutex is locked and _lower_cache->dentry is
  * dget'd.
  */
 struct inode *__union_get_inode_locked(struct dentry *parent,
 				       struct path *upper,
 				       struct path *_lower_cache,
 				       struct path *_actual)
 {
 	const struct union_stack *d;
 	struct dentry *dentry = upper->dentry;
 	struct path lower;
 	unsigned i, layers = parent->d_sb->s_union_count;
 	int ret;

 	pr_devel("-->%s(%pq,)\n", __func__, &dentry->d_name);

 	BUG_ON(d_is_whiteout(dentry));

 	/* Check for a race with copy up. */
 	if (likely(dentry->d_inode)) {
 		pr_devel("<--%s() = upper\n", __func__);
 		*_actual = *upper;
 		return dentry->d_inode;
 	}

 	/* The upper dentry is negative but already pins a lower dentry in
 	 * d_fallthru: reuse it and find which layer's mount it belongs to by
 	 * matching its parent against the parent's stack of lower dirs.
 	 */
 	if (dentry->d_flags & DCACHE_UNION_PINNING_LOWER) {
 		pr_devel("<--%s() = fall\n", __func__);
 		smp_rmb();
 		_actual->dentry = dentry->d_fallthru;
 		d = parent->d_union_stack;
 		for (i = 0; i < layers; i++) {
 			if (d->u_dirs[i].dentry == dentry->d_fallthru->d_parent) {
 				_lower_cache->mnt = d->u_dirs[i].mnt;
 				break;
 			}
 		}
 		/* No layer matched the cached dentry's parent */
 		if (unlikely(!_lower_cache->mnt))
 			goto out_badcache;
 		/* NOTE(review): the comment above this function says the
 		 * pointers in *_actual are not mntget'd, yet a mount ref is
 		 * taken here while _lower_cache->mnt is stored without one -
 		 * confirm who is expected to drop this reference.
 		 */
 		_actual->mnt = mntget(_lower_cache->mnt);
 		return dentry->d_fallthru->d_inode;
 	}

 	/* Search down through the union stack of the parent of the target for
 	 * the lower dentry we're going to use.
 	 */
 	for (i = 0; i < layers; i++) {
 		/* Get the parent directory for this layer and look the target
 		 * up in it.
 		 */
 		const struct path *lower_parent = union_find_dir(parent, i);
 		if (!lower_parent->mnt)
 			continue;

 		/* The lookup is done under the lower parent's i_mutex */
 		mutex_lock(&lower_parent->dentry->d_inode->i_mutex);
 		lower.dentry = __lookup_hash(&dentry->d_name,
 					     lower_parent->dentry, 0);
 		mutex_unlock(&lower_parent->dentry->d_inode->i_mutex);
 		if (IS_ERR(lower.dentry)) {
 			ret = PTR_ERR(lower.dentry);
 			goto out_err;
 		}

 		/* A negative dentry can mean several things: a plain negative
 		 * dentry is ignored and lookup continues to the next layer,
 		 * but a whiteout or a non-fallthru in an opaque dir covers
 		 * everything below it.
 		 */
 		if (!lower.dentry->d_inode) {
 			if (d_is_whiteout(lower.dentry))
 				goto out_hit_barrier;
 			if (IS_OPAQUE(lower_parent->dentry->d_inode) &&
 			    !d_is_fallthru(lower.dentry))
 				goto out_hit_barrier;
 			dput(lower.dentry);
 			continue;
 		}

 		/* TODO: Deal with mountpoints and suchlike */
 		lower.mnt = mntget(lower_parent->mnt);
 		goto out_found_file;
 	}

 	/* Nothing found in any layer.  That is an error for a fallthru (it
 	 * should have had a target) and a plain NULL result otherwise.
 	 */
 out_enoent:
 	if (d_is_fallthru(dentry)) {
 		pr_devel("<--%s() = -ENOENT\n", __func__);
 		return ERR_PTR(-ENOENT);
 	}
 	pr_devel("<--%s() = NULL\n", __func__);
 	return NULL;

 	/* A whiteout or opaque dir stopped the search; drop the negative
 	 * dentry we were looking at and report as not found.
 	 */
 out_hit_barrier:
 	dput(lower.dentry);
 	goto out_enoent;

 	/* Hand the referenced lower path to the cache; *_actual shares the
 	 * same pointers.
 	 */
 out_found_file:
 	*_actual = *_lower_cache = lower;
 	pr_devel("<--%s() = lower\n", __func__);
 	return lower.dentry->d_inode;

 out_err:
 	pr_devel("<--%s() = %d\n", __func__, ret);
 	return ERR_PTR(ret);

 out_badcache:
 	printk_ratelimited(KERN_WARNING "UNION: Bad cached fallthru (%pq/%pq)\n",
 			   &parent->d_name, &upper->dentry->d_name);
 	return ERR_PTR(-EIO);
 }

 /*
  * Find the inode backing a dentry whose inode may live on a lower layer of a
  * union.
  *
  * No reference is taken on the returned inode.  Where it has to be pinned,
  * that is done through a reference on a dentry that points to it; that path
  * is then handed back in *_lower_cache and it is up to the caller to release
  * it.
  *
  * Returns the inode, NULL if the parent is opaque and the dentry is not a
  * fallthru (or nothing was found lower down), or an ERR_PTR on failure or
  * interruption.
  */
 struct inode *__union_get_inode(struct path *upper, struct path *_lower_cache,
 				struct path *_actual)
 {
 	struct dentry *dentry = upper->dentry;
 	struct dentry *parent;
 	struct inode *inode;
 	int ret;

 	pr_devel("-->%s(%pq,)\n", __func__, &dentry->d_name);

 	/* The stack of lower directories hangs off the parent, so we need to
 	 * get hold of that first.  Taking the rename mutex keeps rename from
 	 * moving the dentry whilst we do so.
 	 */
 	if (mutex_lock_interruptible(&dentry->d_sb->s_vfs_rename_mutex) < 0)
 		return ERR_PTR(-EINTR);

 	parent = dget_parent(dentry);

 	/* An opaque parent hides the lower layers for anything but a fallthru */
 	if (IS_OPAQUE(parent->d_inode) && !d_is_fallthru(dentry)) {
 		mutex_unlock(&dentry->d_sb->s_vfs_rename_mutex);
 		inode = NULL;
 		goto out_put_parent;
 	}

 	/* Swap the rename mutex for the parent's i_mutex */
 	ret = mutex_lock_interruptible(&parent->d_inode->i_mutex);
 	mutex_unlock(&dentry->d_sb->s_vfs_rename_mutex);
 	if (ret < 0) {
 		inode = ERR_PTR(ret);
 		goto out_put_parent;
 	}

 	inode = __union_get_inode_locked(parent, upper, _lower_cache, _actual);
 	mutex_unlock(&parent->d_inode->i_mutex);

 out_put_parent:
 	dput(parent);
 	return inode;
 }

 /**
  * union_create_file - Create an unlinked regular file for copy-up
  * @parent: path of the upper parent directory
  * @upper: path of the negative dentry to become new file
  * @lower: path of the source file
  *
  * The file is created through the parent's ->tmpfile() op using the mode of
  * the source file, and its inode is then flagged I_LINKABLE.
  *
  * Must already have mnt_want_write() on the mnt and the parent's i_mutex.
  *
  * Returns 0 on success, -EPERM if the parent has no ->tmpfile() op, or the
  * error from ->tmpfile().
  */
 static int union_create_file(struct path *parent, struct path *upper,
 			     struct path *lower)
 {
 	struct inode *dir = parent->dentry->d_inode;
 	struct inode *new_inode;
 	int ret;

 	BUG_ON(!mutex_is_locked(&dir->i_mutex));

 	if (!dir->i_op->tmpfile)
 		return -EPERM;

 	ret = dir->i_op->tmpfile(dir, upper->dentry,
 				 lower->dentry->d_inode->i_mode);
 	if (ret != 0)
 		return ret;

 	/* i_state is modified under the inode's i_lock */
 	new_inode = upper->dentry->d_inode;
 	spin_lock(&new_inode->i_lock);
 	new_inode->i_state |= I_LINKABLE;
 	spin_unlock(&new_inode->i_lock);
 	return 0;
 }

 /**
  * union_create_symlink - Recreate a lower symlink on the upper layer
  * @parent: Upper parent of the symlink
  * @upper: Path of the negative dentry to become new symlink.
  * @lower: Path of the source symlink
  *
  * The target of the lower symlink is read into a temporary buffer and a new
  * symlink with the same target is created at @upper with vfs_symlink().
  *
  * Must already have mnt_want_write() on the mnt and the parent's i_mutex.
  *
  * Returns the result of vfs_symlink(), -ENOMEM if the buffer could not be
  * allocated, or the negative error from ->readlink().
  */
 static int union_create_symlink(struct path *parent, struct path *upper,
 				struct path *lower)
 {
 	struct inode *dir = parent->dentry->d_inode;
 	struct inode *src = lower->dentry->d_inode;
 	char *target;
 	int ret;

 	BUG_ON(!mutex_is_locked(&dir->i_mutex));

 	/* Room for PATH_MAX + 1 bytes of link body plus the terminator */
 	target = kmalloc(PATH_MAX + 2, GFP_KERNEL);
 	if (!target)
 		return -ENOMEM;

 	/* NOTE(review): ->readlink() is handed a kernel buffer here.  If the
 	 * op expects a userspace pointer in this tree, the address limit
 	 * would need adjusting around the call - confirm.
 	 */
 	ret = src->i_op->readlink(lower->dentry, target, PATH_MAX + 1);
 	if (ret < 0)
 		goto out_free;

 	/* On success the return value is used as the length read */
 	target[ret] = 0;

 	ret = vfs_symlink(dir, upper->dentry, target);
 out_free:
 	kfree(target);
 	return ret;
 }

 /**
  * union_copy_up_data - Copy file contents from the lower file to the new one
  * @path: path of target file
  * @actual: path of source file in lower layer
  * @truncate_to: upper bound on the number of bytes to copy (or NULL if all)
  *
  * Copies min(size of source, *@truncate_to) bytes by splicing directly from
  * the source file to the target file.
  *
  * Returns 0 on success (including when there is nothing to copy), -EFBIG if
  * the length does not fit in a size_t, or a negative error from opening
  * either file or from the splice.
  */
 static int union_copy_up_data(struct path *path, struct path *actual,
 			      const loff_t *truncate_to)
 {
 	const struct cred *cred = current_cred();
 	struct file *src, *dst;
 	loff_t to_copy, pos = 0;
 	size_t len;
 	long copied;
 	int error;

 	to_copy = i_size_read(actual->dentry->d_inode);
 	if (truncate_to && *truncate_to < to_copy)
 		to_copy = *truncate_to;

 	/* Refuse lengths that would be truncated by the size_t conversion */
 	len = to_copy;
 	if (len != to_copy)
 		return -EFBIG;

 	if (!len)
 		return 0;

 	src = dentry_open(actual, O_RDONLY, cred);
 	if (IS_ERR(src))
 		return PTR_ERR(src);

 	dst = dentry_open(path, O_WRONLY, cred);
 	if (IS_ERR(dst)) {
 		error = PTR_ERR(dst);
 	} else {
 		copied = do_splice_direct(src, &pos, dst, len, SPLICE_F_MOVE);
 		error = copied < 0 ? copied : 0;
 		fput(dst);
 	}

 	fput(src);
 	return error;
 }

 /*
  * Create a temporary file.  We don't want to inline this as it uses quite a
  * lot of stack space.
  *
  * The caller should make sure _tmpfile->mnt is set to the upper vfsmount and
  * that ->dentry is NULL.
  *
  * Note: we don't return with a ref on _tmpfile->mnt as path is holding a ref.
  * Further, we may return with a dentry in _tmpfile->dentry that needs
  * dput'ing, even if an error occurred.
  */
 static int union_create_tmpfile(struct path *parent, struct path *path,
 				struct path *actual, struct path *_tmpfile)
 {
 	static const struct qstr nameless = { .name = "", .len = 0, .hash = 0 };
 	struct dentry *dentry;
 	int ret;

 	pr_devel("-->%s(%pq)\n",
 		 __func__, &path->dentry->d_name);

 	/* Create a nameless file not directly attached to the parent
 	 * directory, but still associated with it for layout optimisation
 	 * reasons.  The upperfs should check for the file being of zero
 	 * length.
 	 *
 	 * We will then hard link the file into place when we're done copying
 	 * up - and mount/fsck will clean it up in the event of a crash and
 	 * dget() will clean it up in the event of an error.
 	 */
 	mutex_lock(&parent->dentry->d_inode->i_mutex);

 	dentry = d_alloc(parent->dentry, &nameless);
 	if (!IS_ERR(dentry)) {
 		_tmpfile->dentry = dentry;
 		if (S_ISREG(actual->dentry->d_inode->i_mode))
 			ret = union_create_file(parent, _tmpfile, actual);
 		else if (S_ISLNK(actual->dentry->d_inode->i_mode))
 			ret = union_create_symlink(parent, _tmpfile, actual);
 		else
 			BUG();
 	} else {
 		ret = PTR_ERR(dentry);
 	}

 	mutex_unlock(&parent->dentry->d_inode->i_mutex);
 	pr_devel("<--%s() = %d\n", __func__, ret);
 	return ret;
 }

 /**
  * Copy up a file or symlink into a freshly created temporary object, handing
  * its dentry back in *_tmpfile.
  *
  * Three steps are run in order, stopping at the first failure: create the
  * temporary object, copy the data across (regular files only) and copy the
  * extended attributes.  Returns 0 or the first error encountered.
  */
 static int union_copy_up_to_tmpfile(struct path *parent, struct path *path,
 				    struct path *actual, struct path *_tmpfile,
 				    const loff_t *truncate_to)
 {
 	int ret;

 	ret = union_create_tmpfile(parent, path, actual, _tmpfile);
 	if (ret != 0)
 		return ret;

 	/* Only regular files have contents to transfer */
 	if (S_ISREG(actual->dentry->d_inode->i_mode)) {
 		ret = union_copy_up_data(_tmpfile, actual, truncate_to);
 		if (ret != 0)
 			return ret;
 	}

 	return union_copyup_xattr(_tmpfile, actual->dentry);
 }

 /*
  * Create a hardlink from the temporary file to the actual location.
  */
 static int union_hard_link_to_tmpfile(struct path *parent, struct path *path,
 				      struct path *tmpfile)
 {
 	int ret;

 	pr_devel("-->%s(%pq,%pq,%pq)\n",
 		 __func__, &parent->dentry->d_name, &path->dentry->d_name,
 		 &tmpfile->dentry->d_name);

 	mutex_lock(&parent->dentry->d_inode->i_mutex);
 	ret = vfs_link(tmpfile->dentry, parent->dentry->d_inode, path->dentry);
 	mutex_unlock(&parent->dentry->d_inode->i_mutex);
 	return ret;
 }

 /**
  * union_copy_up_via_tmpfile - Copy up lower file via temporary file
  *
  * The lower file or symlink is first copied into a nameless temporary object
  * next to its final location and is then hard linked into place; the
  * temporary dentry is dropped afterwards whether or not that worked.
  *
  * The whole operation runs under kernel credentials whose fsuid/fsgid are
  * set to the owner and group of the lower inode.
  *
  * Returns 0 on success, -ENOMEM if credentials could not be prepared, or the
  * error from the copy or link step.
  */
 static int union_copy_up_via_tmpfile(struct path *parent, struct path *path,
 				     struct path *actual, const loff_t *truncate_to)
 {
 	struct path tmpfile = { .mnt = path->mnt, .dentry = NULL };
 	const struct cred *old_cred;
 	struct cred *copy_cred;
 	int ret;

 	pr_devel("-->%s(,%pq,%pq,%pq,,%lld)\n",
 		 __func__, &parent->dentry->d_name, &path->dentry->d_name,
 		 &actual->dentry->d_name, truncate_to ? *truncate_to : -1);

 	copy_cred = prepare_kernel_cred(NULL);
 	if (!copy_cred)
 		return -ENOMEM;

 	/* Create the copy as the owner of the original */
 	copy_cred->fsuid = actual->dentry->d_inode->i_uid;
 	copy_cred->fsgid = actual->dentry->d_inode->i_gid;

 	old_cred = override_creds(copy_cred);

 	ret = union_copy_up_to_tmpfile(parent, path, actual, &tmpfile,
 				       truncate_to);
 	if (!ret)
 		ret = union_hard_link_to_tmpfile(parent, path, &tmpfile);

 	/* The temporary dentry is no longer needed, success or not */
 	dput(tmpfile.dentry);

 	revert_creds(old_cred);
 	put_cred(copy_cred);

 	pr_devel("<--%s() = %d\n", __func__, ret);
 	return ret;
 }

 /*
  * Make copy-up an exclusive operation on a file.  The caller must have the
  * parent i_mutex locked - which we will unlock during this function.
  *
  * If another task is already copying this dentry up (DCACHE_UNION_COPYING_UP
  * is set), we wait for it to finish and return -EAGAIN so that the caller
  * re-evaluates the situation, or -ERESTARTSYS if a signal arrived first.
  *
  * Otherwise we set the flag, perform the copy-up ourselves, clear the flag,
  * wake any waiters and return the result of the copy-up.
  */
 static int __union_copy_up_exclusive(struct path *parent, struct path *path,
 				     struct path *actual, const loff_t *truncate_to)
 	__releases(parent->dentry->d_inode->i_mutex)
 {
 	struct dentry *upper = path->dentry;
 	int ret;

 	spin_lock(&upper->d_lock);
 	if (upper->d_flags & DCACHE_UNION_COPYING_UP) {
 		/* Copy up already in progress */
 		spin_unlock(&upper->d_lock);
 		mutex_unlock(&parent->dentry->d_inode->i_mutex);
 		pr_devel("UNION: wait on copyup\n");

 		/* Abuse the bit-wait system to get hold of a waitqueue we can
 		 * use (d_flags may be smaller than an unsigned long).
 		 */
 		do {
 			wait_queue_head_t *wq =
 				bit_waitqueue(&upper->d_flags, ilog2(DCACHE_UNION_COPYING_UP));
 			DEFINE_WAIT(__wait);

 			ret = -EAGAIN;
 			for (;;) {
 				prepare_to_wait(wq, &__wait, TASK_INTERRUPTIBLE);
 				if (!(upper->d_flags & DCACHE_UNION_COPYING_UP))
 					break;
 				if (!signal_pending(current)) {
 					schedule();
 					continue;
 				}
 				ret = -ERESTARTSYS;
 				break;
 			}
 			finish_wait(wq, &__wait);
 		} while (0);
 		return ret; /* There might have been an error or a signal */
 	}

 	/* Commence copying up.
 	 *
 	 * Mark the dentry so that other potential copy-uppers will wait for us
 	 * and drop the locks so that we can use splice.
 	 */
 	upper->d_flags |= DCACHE_UNION_COPYING_UP;
 	spin_unlock(&upper->d_lock);
 	mutex_unlock(&parent->dentry->d_inode->i_mutex);

 	pr_devel("UNION: copyup begin\n");
 	ret = union_copy_up_via_tmpfile(parent, path, actual, truncate_to);
 	pr_devel("UNION: copyup done\n");

 	spin_lock(&upper->d_lock);
 	upper->d_flags &= ~DCACHE_UNION_COPYING_UP;
 	spin_unlock(&upper->d_lock);

 	wake_up_bit(&upper->d_flags, ilog2(DCACHE_UNION_COPYING_UP));

 	/* Report the outcome of the copy-up itself; returning 0 here would
 	 * make a failed copy-up look like a success to the caller.
 	 */
 	return ret;
 }

 /**
  * __union_copy_up - Copy a non-directory file up to the upper layer.
  */
 int __union_copy_up(struct path *path, struct path *actual, const loff_t *truncate_to)
 {
 	struct dentry *upper = path->dentry;
 	struct path parent;
 	int ret;

 	pr_devel("-->%s(%pq)\n", __func__, &path->dentry->d_name);

 	/* We don't currently support copyup of special files, though in theory
 	 * there's no reason we couldn't at least copy up blockdev and chrdev
 	 * files.  FIFO files are problematic if open.  Socket files are
 	 * managed by AF_UNIX and would need help from there.  Directories are
 	 * handled by pathwalk.
 	 */
 	if (!S_ISREG(actual->dentry->d_inode->i_mode) &&
 	    !S_ISLNK(actual->dentry->d_inode->i_mode))
 		return -EACCES;

 	/* The parent lives on the same mount as the upper dentry */
 	parent.mnt = path->mnt;
 	do {
 		/* We need to get the parent directory and then we need to lock
 		 * it.  Use the rename mutex to prevent rename from getting
 		 * underfoot whilst we do this.
 		 */
 		if (mutex_lock_interruptible(&upper->d_sb->s_vfs_rename_mutex) < 0)
 			return -EINTR;

 		/* Someone else completed the copy up before we got the lock */
 		if (upper->d_inode) {
 			mutex_unlock(&upper->d_sb->s_vfs_rename_mutex);
 			goto already_copied_up;
 		}

 		parent.dentry = dget_parent(upper);
 		BUG_ON(IS_OPAQUE(parent.dentry->d_inode) && !d_is_fallthru(upper));
 		BUG_ON(d_is_whiteout(upper));

 		/* Take the parent's i_mutex before letting go of the rename
 		 * mutex.
 		 */
 		ret = mutex_lock_interruptible(&parent.dentry->d_inode->i_mutex);
 		mutex_unlock(&upper->d_sb->s_vfs_rename_mutex);
 		if (ret < 0) {
 			dput(parent.dentry);
 			goto out;
 		}

 		/* Recheck now that the parent is locked */
 		if (upper->d_inode)
 			goto already_copied_up_unlock;

 		/* Do the copy up (unlocks the parent) */
 		ret = __union_copy_up_exclusive(&parent, path, actual, truncate_to);
 		dput(parent.dentry);
 		/* -EAGAIN means we waited on another copier; go round again */
 	} while (ret == -EAGAIN);

 out:
 	pr_devel("<--%s() = %d\n", __func__, ret);
 	return ret;

 already_copied_up_unlock:
 	mutex_unlock(&parent.dentry->d_inode->i_mutex);
 	dput(parent.dentry);
 already_copied_up:
 	pr_devel("<--%s() = 0 [already done]\n", __func__);
 	/* The upper dentry is now positive, so it is the actual object */
 	*actual = *path;
 	return 0;
 }

 /*
  * Copy up a file on behalf of do_last.  The parent is supplied by the caller,
  * but the lower dentry to copy from still has to be located here.
  *
  * If @will_truncate is set, no data is copied (the copy is limited to zero
  * bytes).  Returns 0 if the copy-up succeeded or turned out to be
  * unnecessary, otherwise a negative error.
  */
 int __union_copy_up_for_do_last(struct path *parent, struct path *path,
 				bool will_truncate)
 {
 	struct path lower_cache, actual;
 	struct inode *inode;
 	loff_t zero = 0;
 	const loff_t *truncate_to = will_truncate ? &zero : NULL;
 	int ret;

 	pr_devel("-->%s(%pq)\n", __func__, &path->dentry->d_name);

 	for (;;) {
 		ret = mutex_lock_interruptible(&parent->dentry->d_inode->i_mutex);
 		if (ret < 0)
 			return ret;

 		/* Check to see if we raced with another copy-up or an unlink */
 		if (path->dentry->d_parent != parent->dentry ||
 		    path->dentry->d_inode) {
 			ret = 0;
 			break;
 		}

 		inode = union_get_inode_locked(parent->dentry, path,
 					       &lower_cache, &actual);
 		if (IS_ERR(inode)) {
 			ret = PTR_ERR(inode);
 			break;
 		}

 		/* Do the copy up (unlocks the parent). */
 		ret = __union_copy_up_exclusive(parent, path, &actual,
 						truncate_to);
 		path_put_maybe(&lower_cache);

 		/* -EAGAIN means we waited on another copier; go round again */
 		if (ret != -EAGAIN) {
 			pr_devel("<--%s() = %d [post]\n", __func__, ret);
 			return ret;
 		}
 	}

 	/* Bailed out before the copy-up: the parent is still locked */
 	mutex_unlock(&parent->dentry->d_inode->i_mutex);
 	pr_devel("<--%s() = %d [pre]\n", __func__, ret);
 	return ret;
 }