Development
diff --git a/fs/Makefile b/fs/Makefile
index 4fe6df3..65b25e3 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -51,6 +51,7 @@
 obj-$(CONFIG_FHANDLE)		+= fhandle.o
 
 obj-y				+= quota/
+obj-$(CONFIG_UNION_MOUNT)	+= union.o
 
 obj-$(CONFIG_PROC_FS)		+= proc/
 obj-$(CONFIG_SYSFS)		+= sysfs/
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index ca49f47..9fea4b5 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -391,7 +391,8 @@
 	path.dentry = dir;
 	path_to_graveyard.mnt = cache->mnt;
 	path_to_graveyard.dentry = cache->graveyard;
-	ret = security_path_rename(&path, rep, &path_to_graveyard, grave);
+	ret = security_path_rename(&path, rep, &path_to_graveyard, grave,
+				   rep->d_inode);
 	if (ret < 0) {
 		cachefiles_io_error(cache, "Rename security error %d", ret);
 	} else {
diff --git a/fs/dcache.c b/fs/dcache.c
index 6e9895a..bf9071b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -40,6 +40,7 @@
 #include <linux/list_lru.h>
 #include "internal.h"
 #include "mount.h"
+#include "union.h"
 
 /*
  * Usage:
@@ -445,6 +446,7 @@
 	if (parent)
 		spin_unlock(&parent->d_lock);
 	dentry_iput(dentry);
+	d_free_unions(dentry);
 	/*
 	 * dentry_iput drops the locks, at which point nobody (except
 	 * transient RCU lookups) can reach this dentry.
@@ -1522,6 +1524,9 @@
 	dentry->d_sb = sb;
 	dentry->d_op = NULL;
 	dentry->d_fsdata = NULL;
+#ifdef CONFIG_UNION_MOUNT
+	dentry->d_union_stack = NULL;
+#endif
 	INIT_HLIST_BL_NODE(&dentry->d_hash);
 	INIT_LIST_HEAD(&dentry->d_lru);
 	INIT_LIST_HEAD(&dentry->d_subdirs);
@@ -2384,6 +2389,7 @@
 		}
 		dentry->d_flags &= ~DCACHE_CANT_MOUNT;
 		dentry_unlink_inode(dentry);
+		d_free_unions(dentry);
 		fsnotify_nameremove(dentry, isdir);
 		return;
 	}
@@ -2393,6 +2399,12 @@
 
 	spin_unlock(&dentry->d_lock);
 
+	/* Remove any associated unions.  While someone still has this
+	 * directory open (ref count > 0), we could not have deleted it unless
+	 * it was empty, and therefore has no references to directories below
+	 * it.  So we don't need the unions.
+	 */
+	d_free_unions(dentry);
 	fsnotify_nameremove(dentry, isdir);
 }
 EXPORT_SYMBOL(d_delete);
diff --git a/fs/inode.c b/fs/inode.c
index 8377d55..bb439c0 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1553,6 +1553,11 @@
 	struct inode *inode = path->dentry->d_inode;
 	struct timespec now;
 
+#ifdef CONFIG_UNION_MOUNT
+	if (unlikely(!inode))
+		return;
+#endif
+
 	if (inode->i_flags & S_NOATIME)
 		return;
 	if (IS_NOATIME(inode))
diff --git a/fs/internal.h b/fs/internal.h
index 4657424..6ab8541 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -46,6 +46,8 @@
 extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *);
 extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
 			   const char *, unsigned int, struct path *);
+extern int sb_permission(struct super_block *, struct inode *, int);
+extern struct dentry *__lookup_hash(struct qstr *, struct dentry *, unsigned);
 
 /*
  * namespace.c
diff --git a/fs/libfs.c b/fs/libfs.c
index c8360bc..2f9b458 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -153,6 +153,7 @@
 	struct list_head *p, *q = &cursor->d_u.d_child;
 	ino_t ino;
 	char d_type;
+	int err = 0;
 
 	if (!dir_emit_dots(file, ctx))
 		return 0;
@@ -172,9 +173,13 @@
 		spin_unlock(&next->d_lock);
 		spin_unlock(&dentry->d_lock);
 		if (d_is_fallthru(next)) {
-			/* XXX placeholder until generic_readdir_fallthru() arrives */
-			ino = 1;
-			d_type = DT_UNKNOWN;
+			/* On tmpfs, should only fail with ENOMEM, EIO, etc. */
+			err = generic_readdir_fallthru(file->f_path.dentry,
+						       next->d_name.name,
+						       next->d_name.len,
+						       &ino, &d_type);
+			if (err)
+				return err;
 		} else {
 			ino = next->d_inode->i_ino;
 			d_type = dt_type(next->d_inode);
diff --git a/fs/namei.c b/fs/namei.c
index a1b97c9..d87aa8d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -38,6 +38,7 @@
 
 #include "internal.h"
 #include "mount.h"
+#include "union.h"
 
 /* [Feb-1997 T. Schoebel-Theuer]
  * Fundamental changes in the pathname lookup mechanisms (namei)
@@ -411,7 +412,7 @@
  *
  * Separate out file-system wide checks from inode-specific permission checks.
  */
-static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
+int sb_permission(struct super_block *sb, struct inode *inode, int mask)
 {
 	if (unlikely(mask & MAY_WRITE)) {
 		umode_t mode = inode->i_mode;
@@ -585,6 +586,7 @@
 static int complete_walk(struct nameidata *nd)
 {
 	struct dentry *dentry = nd->path.dentry;
+	struct inode *inode;
 	int status;
 
 	if (nd->flags & LOOKUP_RCU) {
@@ -607,6 +609,8 @@
 			mntput(nd->path.mnt);
 			return -ECHILD;
 		}
+		inode = d_inode_or_lower(dentry);
+		BUG_ON(nd->inode != inode);
 		rcu_read_unlock();
 	}
 
@@ -683,9 +687,16 @@
 
 static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
 {
-	struct inode *inode = link->dentry->d_inode;
+	struct dentry *dentry = link->dentry;
+	struct inode *inode;
+
+	/* If the link was on the lower layer of a union when we started
+	 * following it, then follow_link() must have updated link->dentry to
+	 * point to that.
+	 */
+	inode = dentry->d_inode;
 	if (inode->i_op->put_link)
-		inode->i_op->put_link(link->dentry, nd, cookie);
+		inode->i_op->put_link(dentry, nd, cookie);
 	path_put(link);
 }
 
@@ -718,6 +729,10 @@
 
 	/* Allowed if owner and follower match. */
 	inode = link->dentry->d_inode;
+#ifdef CONFIG_UNION_MOUNT
+	if (!inode)
+		inode = link->dentry->d_fallthru->d_inode;
+#endif
 	if (uid_eq(current_cred()->fsuid, inode->i_uid))
 		return 0;
 
@@ -785,21 +800,19 @@
  */
 static int may_linkat(struct path *link)
 {
-	const struct cred *cred;
-	struct inode *inode;
+	struct inode *inode = link->dentry->d_inode;
 
 	if (!sysctl_protected_hardlinks)
 		return 0;
 
-	cred = current_cred();
-	inode = link->dentry->d_inode;
-
 	/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
 	 * otherwise, it must be a safe source.
 	 */
-	if (uid_eq(cred->fsuid, inode->i_uid) || safe_hardlink_source(inode) ||
-	    capable(CAP_FOWNER))
+	if (uid_eq(current_cred()->fsuid, inode->i_uid) ||
+	    safe_hardlink_source(inode) ||
+	    capable(CAP_FOWNER)) {
 		return 0;
+	}
 
 	audit_log_link_denied("linkat", link);
 	return -EPERM;
@@ -827,7 +840,20 @@
 	touch_atime(link);
 	nd_set_link(nd, NULL);
 
-	error = security_inode_follow_link(link->dentry, nd);
+#ifdef CONFIG_UNION_MOUNT
+	if (unlikely(!dentry->d_inode)) {
+		/* If the link is on the lower layer of a union, then we need
+		 * to save this fact so that put_link() can call the correct
+		 * ->put_link() op if the link gets copied whilst we're using
+		 * it.
+		 */
+		link->dentry = dentry->d_fallthru;
+		dput(dentry);
+		dentry = dget(link->dentry);
+	}
+#endif
+
+	error = security_inode_follow_link(dentry, nd);
 	if (error)
 		goto out_put_nd_path;
 
@@ -1257,6 +1283,378 @@
 }
 
 /*
+ * Inspect the lower layers of a potentially unioned file and appropriately
+ * annotate the upper dentry.  Returns:
+ *
+ * (*) 0 if encountered a dir first - the union stack will be filled in, but
+ *     will not be attached to the dentry.  The caller must create the top
+ *     dentry first and only then attach it.
+ *
+ * (*) -ENOTDIR if encountered a non-directory first - the upper dentry pins
+ *     the lower one (d_fallthru) and takes its type flag, e.g. DCACHE_SYMLINK.
+ *
+ * (*) -ENOENT if we didn't encounter anything, or met a blocker first.
+ *
+ * (*) -EXDEV/-EREMOTE for a lower mountpoint/managed dir, or a lookup error.
+ *
+ * The caller must hold i_mutex on the parent dir.
+ */
+static int union_annotate_dentry(struct path *parent, struct path *path,
+				 struct union_stack *d)
+{
+	struct dentry *dentry = path->dentry, *lower = NULL;
+	unsigned flags, i, layers = parent->dentry->d_sb->s_union_count;
+	int ret = -ENOENT;
+
+	printk("UNION: -->union_annotate_dentry(%pd/%pd {%x})\n",
+	       parent->dentry, dentry, dentry->d_flags);
+
+	BUG_ON(dentry->d_flags & DCACHE_UNION_PINNING_LOWER);
+
+	if (d_is_whiteout(dentry)) {
+		spin_lock(&dentry->d_lock);
+		BUG_ON(!d_is_whiteout(dentry));
+		dentry->d_flags |= DCACHE_UNION_LOOKUP_DONE;
+		spin_unlock(&dentry->d_lock);
+		return -ENOENT;
+	}
+
+	if (IS_OPAQUE(parent->dentry->d_inode) && !d_is_fallthru(dentry)) {
+		spin_lock(&dentry->d_lock);
+		dentry->d_flags |= DCACHE_UNION_LOOKUP_DONE;
+		spin_unlock(&dentry->d_lock);
+		return -ENOENT;
+	}
+
+	for (i = 0; i < layers; i++) {
+		/* Get the parent directory for this layer and lookup
+		 * the target in it.
+		 */
+		struct path *lower_parent = union_find_dir(parent->dentry, i);
+		if (!lower_parent->mnt)	/* presumably: no dir on this layer */
+			continue;
+
+		mutex_lock(&lower_parent->dentry->d_inode->i_mutex);
+		lower = __lookup_hash(&dentry->d_name, lower_parent->dentry, 0);
+		mutex_unlock(&lower_parent->dentry->d_inode->i_mutex);
+
+		if (IS_ERR(lower)) {
+			ret = PTR_ERR(lower);
+			goto error_no_dput;
+		}
+
+		/* A negative dentry can mean several things: a plain negative
+		 * dentry is ignored and lookup continues to the next layer,
+		 * but a whiteout or a non-fallthru in an opaque dir covers
+		 * everything below it.
+		 */
+		if (!lower->d_inode) {
+			if (d_is_whiteout(lower))
+				goto found_blocker;
+			if (!d_is_fallthru(lower) &&
+			    IS_OPAQUE(lower_parent->dentry->d_inode))
+				goto found_blocker;
+			dput(lower);
+			lower = NULL;
+			continue;
+		}
+
+		/* Non-directories block everything below them.  Special case:
+		 * If we find a file below a directory (which makes no sense),
+		 * just ignore the file and return the directory above it.
+		 */
+		if (!d_is_directory(lower)) {
+			if (ret != -ENOENT)
+				break;
+			goto found_nondir_first;
+		}
+
+		printk("UNION: layer %u is dir\n", i);
+
+		/* Mountpoints and automount points on a lowerfs just confuse
+		 * everything, so refuse to handle them for the moment.
+		 */
+		if (unlikely(d_mountpoint(lower))) {
+			if (ret == -ENOENT)
+				ret = -EXDEV;
+			goto error_dput;
+		}
+		if (unlikely(d_managed(lower))) {
+			if (ret == -ENOENT)
+				ret = -EREMOTE;
+			goto error_dput;
+		}
+
+		d->u_dirs[i].dentry = lower;
+		d->u_dirs[i].mnt = mntget(lower_parent->mnt);
+		lower = NULL;	/* ref now held by d->u_dirs[i] */
+		ret = 0;
+	}
+
+	/* We may have found a lower directory at this point.  If we did, we
+	 * don't annotate the dentry, but rather leave that to the caller to do
+	 * when creating the upper directory.
+	 *
+	 * If there was nothing underneath, then annotate the top as being
+	 * negative.
+	 */
+	if (ret == -ENOENT) {
+		printk("UNION: Nothing underneath\n");
+		flags = DCACHE_MISS_TYPE | DCACHE_UNION_LOOKUP_DONE;
+		goto set_negative;
+	}
+found_directory:
+	dput(lower);
+	return 0;
+
+	/* We found a blocking dentry in the lower levels so we mark the top
+	 * dentry as blocking too.  The whiteout/negative type is propagated
+	 * upwards.
+	 */
+found_blocker:
+	printk("UNION: Found opaque/whiteout first\n");
+	if (ret == 0)
+		goto found_directory;
+	flags = __d_entry_type(lower) | DCACHE_UNION_LOOKUP_DONE;
+	dput(lower);
+set_negative:
+	spin_lock(&dentry->d_lock);
+	if (!(dentry->d_flags & DCACHE_UNION_LOOKUP_DONE)) {
+		BUG_ON(!d_is_miss(dentry));
+		dentry->d_flags |= flags;
+	}
+	spin_unlock(&dentry->d_lock);
+	return -ENOENT;
+
+	/* A dentry covering a lower file of any type is flagged and pins that
+	 * file, saving us from passing the lower dentry back to the caller.  It
+	 * need only still be a miss if nobody has annotated it before us.
+	 */
+found_nondir_first:
+	printk("UNION: Found non-dir first\n");
+	flags = __d_entry_type(lower) | DCACHE_UNION_LOOKUP_DONE;
+	spin_lock(&dentry->d_lock);
+	if (!(dentry->d_flags & DCACHE_UNION_LOOKUP_DONE)) {
+		BUG_ON(!d_is_miss(dentry));
+		d_pin_lower(dentry, lower);
+		dentry->d_flags |= flags;
+		printk("UNION: pin lower %x\n", dentry->d_flags);
+		lower = NULL;
+	} else {
+		printk("UNION: lower already pinned\n");
+	}
+	spin_unlock(&dentry->d_lock);
+	dput(lower);
+	return -ENOTDIR;
+
+error_dput:
+	dput(lower);
+error_no_dput:
+	return ret;
+}
+
+/**
+ * __union_lookup_point_locked - Search the lower layers for a dentry, locked
+ * @parent: Directory on the upper file system that contains @path
+ * @path: The upper file system dentry to annotate
+ *
+ * Requires the parent directory's i_mutex to be held and write access to the
+ * mount to have been obtained by the caller.
+ */
+static int __union_lookup_point_locked(struct path *parent, struct path *path)
+{
+	struct dentry *upper = path->dentry;
+	struct union_stack *stack;
+	int err;
+
+	printk("UNION: -->__union_lookup_point_locked(%pd/%pd)\n",
+	       parent->dentry, upper);
+
+	stack = union_alloc_stack(path);
+	if (!stack)
+		return -ENOMEM;
+
+	err = union_annotate_dentry(parent, path, stack);
+	if (err < 0) {
+		/* Finding a non-directory first is not a failure */
+		if (err == -ENOTDIR)
+			err = 0;
+		goto discard_stack;
+	}
+
+	/* A lower directory was found; it needs a counterpart on top */
+	printk("UNION: May need to create dir\n");
+	err = union_create_topmost_dir(parent, path, stack);
+	if (err < 0)
+		goto discard_stack;
+
+	/* Hand the stack over to the dentry and mark the lookup complete */
+	spin_lock(&upper->d_lock);
+	d_set_union_stack(upper, stack);
+	upper->d_flags |= DCACHE_UNION_LOOKUP_DONE;
+	spin_unlock(&upper->d_lock);
+	err = 0;
+	goto done;
+discard_stack:
+	union_free(parent, stack);
+done:
+	printk("UNION: <--__union_lookup_point_locked() = %d\n", err);
+	return err;
+}
+
+static int union_lookup_point_locked(struct path *parent, struct path *path)
+{
+	/* Nothing to do off a union or once the lower lookup has been done */
+	if (IS_PATH_UNIONED(parent) &&
+	    !(path->dentry->d_flags & DCACHE_UNION_LOOKUP_DONE))
+		return __union_lookup_point_locked(parent, path);
+	return 0;
+}
+
+/**
+ * __union_lookup_point - Look up the current point, raising a dir to upper level
+ * @parent: The parent of @path
+ * @path: Path of the target on the upper file system
+ * @got_write: True if the caller already has write access to the upper mount
+ */
+static int __union_lookup_point(struct path *parent, struct path *path,
+				bool got_write)
+{
+	struct union_stack *d;
+	struct dentry *dentry = path->dentry;
+	struct inode *dir = parent->dentry->d_inode;
+	int ret;
+
+	printk("UNION: -->__union_lookup_point(%pd/%pd)\n",
+	       parent->dentry, path->dentry);
+
+	d = union_alloc_stack(path);	/* released at out: unless attached */
+	if (!d)
+		return -ENOMEM;
+
+	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+	if (dentry->d_flags & DCACHE_UNION_LOOKUP_DONE) {
+		printk("UNION: already (1)\n");
+		ret = 0;
+		goto out_unlock_mutex;
+	}
+
+	ret = union_annotate_dentry(parent, path, d);
+	mutex_unlock(&dir->i_mutex);
+
+	if (ret < 0) {
+		if (ret == -ENOTDIR) {	/* non-directory found: not an error */
+			printk("UNION: sym/file\n");
+			ret = 0;
+		}
+		goto out;
+	}
+
+	/* It's a directory, so it must be raised to the upper level.  We had
+	 * to drop the parent lock to get mount write access before retaking
+	 * it, so LOOKUP_DONE is rechecked below in case someone beat us to it.
+	 */
+	printk("UNION: May need to raise dir\n");
+	if (!got_write) {
+		ret = mnt_want_write(parent->mnt);
+		if (ret < 0)
+			goto out;
+	}
+
+	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+
+	if (!(dentry->d_flags & DCACHE_UNION_LOOKUP_DONE)) {
+		printk("UNION: Need to raise dir\n");
+		ret = union_create_topmost_dir(parent, path, d);
+		if (ret == 0) {
+			spin_lock(&dentry->d_lock);
+			d_set_union_stack(dentry, d);
+			dentry->d_flags |= DCACHE_UNION_LOOKUP_DONE;
+			spin_unlock(&dentry->d_lock);
+			d = NULL;	/* stack now belongs to the dentry */
+		}
+	} else {
+		printk("UNION: already (2)\n");
+		ret = 0;
+	}
+
+	if (!got_write)		/* only drop what was taken here */
+		mnt_drop_write(parent->mnt);
+out_unlock_mutex:
+	mutex_unlock(&dir->i_mutex);
+out:
+	union_free(parent, d);	/* d is NULL here if it was attached above */
+	printk("UNION: <--__union_lookup_point() = %d\n", ret);
+	return ret;
+}
+
+static int union_lookup_point(struct nameidata *nd, struct path *path,
+			      bool got_write)
+{
+	if (!IS_PATH_UNIONED(&nd->path))
+		return 0;
+	if (path->dentry->d_flags & DCACHE_UNION_LOOKUP_DONE)
+		return 0;
+	/* __union_lookup_point() takes mutexes, so leave RCU-walk mode first */
+	if (nd->flags & LOOKUP_RCU) {
+		printk("UNION: unlazy for union_lookup_point()\n");
+		if (unlikely(unlazy_walk(nd, path->dentry)))
+			return -ECHILD;
+	}
+	return __union_lookup_point(&nd->path, path, got_write);
+}
+
+/*
+ * lookup_union_rcu - Decide whether RCU-walk can step over a unioned dentry
+ * @parent: The directory in which the lookup was made.
+ * @path: The point just looked up in @parent.
+ * @inode: The inode found for @path (*@inode is NULL if there is none).
+ *
+ * Deal with a dentry on a union mount upperfs that may stand in for something
+ * on a lower layer.  Nothing is modified here; we only make the decision.
+ *
+ * Returns true if the pathwalk may carry on in RCU-walk mode with what it
+ * has, or false if the caller must drop out of RCU-walk and go to refwalk
+ * mode so that the lower layers can be consulted.
+ */
+static bool lookup_union_rcu(struct path *parent,
+			     struct path *path,
+			     struct inode **inode)
+{
+	struct dentry *upper = path->dentry;
+
+	/* Dentries outside of a union mount need no special treatment. */
+	if (likely(!IS_PATH_UNIONED(parent)))
+		return true;
+
+	printk("UNION: Dir is unioned (RCU)\n");
+
+	/* Two cases let us carry on without looking below:
+	 *
+	 * (1) An inode was found: the file or directory has been copied up
+	 *     and the user gets to play with that.
+	 *
+	 * (2) The dentry is a blocker, so the walk stops here.
+	 */
+	if (*inode || d_is_negative(upper))
+		return true;
+
+	/* Otherwise we have to look below, which means breaking out of RCU
+	 * walk mode with immediate effect.  That covers three things:
+	 *
+	 * (a) A lower directory, which must be copied up.
+	 *
+	 * (b) A symlink.  Symlinks are walked in refwalk mode (or (c) applies
+	 *     if NOFOLLOW).
+	 *
+	 * (c) Some other type of file, which must terminate the pathwalk
+	 *     immediately, one way or another.
+	 */
+	printk("UNION: Drop out of RCU\n");
+	return false;
+}
+
+/*
  * This looks up the name in dcache, possibly revalidates the old dentry and
  * allocates a new one if not found or not valid.  In the need_lookup argument
  * returns whether i_op->lookup is necessary.
@@ -1321,7 +1719,7 @@
 	return dentry;
 }
 
-static struct dentry *__lookup_hash(struct qstr *name,
+struct dentry *__lookup_hash(struct qstr *name,
 		struct dentry *base, unsigned int flags)
 {
 	bool need_lookup;
@@ -1339,8 +1737,8 @@
  *  small and for now I'd prefer to have fast path as straight as possible.
  *  It _is_ time-critical.
  */
-static int lookup_fast(struct nameidata *nd,
-		       struct path *path, struct inode **inode)
+static noinline int lookup_fast(struct nameidata *nd,
+		       struct path *path, struct inode **_inode)
 {
 	struct vfsmount *mnt = nd->path.mnt;
 	struct dentry *dentry, *parent = nd->path.dentry;
@@ -1348,22 +1746,29 @@
 	int status = 1;
 	int err;
 
+	if (IS_PATH_UNIONED(&nd->path))
+		printk("UNION: --> lookup_fast(%*.*s)\n",
+		       nd->last.len, nd->last.len, nd->last.name);
+
 	/*
 	 * Rename seqlock is not required here because in the off chance
 	 * of a false negative due to a concurrent rename, we're going to
 	 * do the non-racy lookup, below.
 	 */
 	if (nd->flags & LOOKUP_RCU) {
-		unsigned seq;
+		unsigned seq, pseq;
 		dentry = __d_lookup_rcu(parent, &nd->last, &seq);
-		if (!dentry)
+		if (!dentry) {
+			if (IS_PATH_UNIONED(&nd->path))
+				printk("UNION: __d_lookup_rcu\n");
 			goto unlazy;
+		}
 
 		/*
 		 * This sequence count validates that the inode matches
 		 * the dentry name information from lookup.
 		 */
-		*inode = dentry->d_inode;
+		*_inode = d_inode_or_lower(dentry);
 		if (read_seqcount_retry(&dentry->d_seq, seq))
 			return -ECHILD;
 
@@ -1374,7 +1779,8 @@
 		 * The memory barrier in read_seqcount_begin of child is
 		 *  enough, we can use __read_seqcount_retry here.
 		 */
-		if (__read_seqcount_retry(&parent->d_seq, nd->seq))
+		pseq = nd->seq;
+		if (__read_seqcount_retry(&parent->d_seq, pseq))
 			return -ECHILD;
 		nd->seq = seq;
 
@@ -1383,20 +1789,37 @@
 			if (unlikely(status <= 0)) {
 				if (status != -ECHILD)
 					need_reval = 0;
+				if (IS_PATH_UNIONED(&nd->path))
+					printk("UNION: d_revalidate\n");
 				goto unlazy;
 			}
 		}
 		path->mnt = mnt;
 		path->dentry = dentry;
-		if (unlikely(!__follow_mount_rcu(nd, path, inode)))
+		if (unlikely(!lookup_union_rcu(&nd->path, path, _inode))) {
+			if (IS_PATH_UNIONED(&nd->path))
+				printk("UNION: !lookup_union_rcu\n");
 			goto unlazy;
-		if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
+		}
+		if (unlikely(!__follow_mount_rcu(nd, path, _inode))) {
+			if (IS_PATH_UNIONED(&nd->path))
+				printk("UNION: !__follow_mount_rcu\n");
 			goto unlazy;
+		}
+		if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) {
+			if (IS_PATH_UNIONED(&nd->path))
+				printk("UNION: need_automount\n");
+			goto unlazy;
+		}
 		return 0;
 unlazy:
+		if (IS_PATH_UNIONED(&nd->path))
+			printk("UNION: unlazy\n");
 		if (unlazy_walk(nd, dentry))
 			return -ECHILD;
 	} else {
+		if (IS_PATH_UNIONED(&nd->path))
+			printk("UNION: !RCU\n");
 		dentry = __d_lookup(parent, &nd->last);
 	}
 
@@ -1425,15 +1848,18 @@
 	}
 	if (err)
 		nd->flags |= LOOKUP_JUMPED;
-	*inode = path->dentry->d_inode;
+
+	*_inode = d_inode_or_lower(path->dentry);
 	return 0;
 
 need_lookup:
+	if (IS_PATH_UNIONED(&nd->path))
+		printk("UNION: need_lookup\n");
 	return 1;
 }
 
 /* Fast lookup failed, do it the slow way */
-static int lookup_slow(struct nameidata *nd, struct path *path)
+static noinline int lookup_slow(struct nameidata *nd, struct path *path)
 {
 	struct dentry *dentry, *parent;
 	int err;
@@ -1441,9 +1867,17 @@
 	parent = nd->path.dentry;
 	BUG_ON(nd->inode != parent->d_inode);
 
+	if (IS_PATH_UNIONED(&nd->path))
+		printk("UNION: --> lookup_slow(%*.*s)\n",
+		       nd->last.len, nd->last.len, nd->last.name);
+
 	mutex_lock(&parent->d_inode->i_mutex);
 	dentry = __lookup_hash(&nd->last, parent, nd->flags);
 	mutex_unlock(&parent->d_inode->i_mutex);
+
+	if (IS_PATH_UNIONED(&nd->path))
+		printk("UNION: slow: __lookup_hash() = %p\n", dentry);
+
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 	path->mnt = nd->path.mnt;
@@ -1458,7 +1892,7 @@
 	return 0;
 }
 
-static inline int may_lookup(struct nameidata *nd)
+static noinline int may_lookup(struct nameidata *nd)
 {
 	if (nd->flags & LOOKUP_RCU) {
 		int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
@@ -1505,11 +1939,16 @@
 	return unlikely(d_is_symlink(dentry)) ? follow : 0;
 }
 
-static inline int walk_component(struct nameidata *nd, struct path *path,
+static noinline int walk_component(struct nameidata *nd, struct path *path,
 		int follow)
 {
 	struct inode *inode;
 	int err;
+
+	if (IS_PATH_UNIONED(&nd->path))
+		printk("UNION: --> walk_component(%*.*s)\n",
+		       nd->last.len, nd->last.len, nd->last.name);
+
 	/*
 	 * "." and ".." are special - ".." especially so because it has
 	 * to be able to know about the current root directory and
@@ -1526,11 +1965,30 @@
 		if (err < 0)
 			goto out_err;
 
-		inode = path->dentry->d_inode;
+		inode = d_inode_or_lower(path->dentry);
 	}
-	err = -ENOENT;
-	if (!inode)
-		goto out_path_put;
+
+	if (IS_PATH_UNIONED(path)) {
+		printk("UNION: walk_comp: path->mnt UPPER%s\n",
+		       nd->flags & LOOKUP_RCU ? " RCU" : "");
+		printk("UNION: dentry %pd: %p(%p)\n",
+		       path->dentry, path->dentry, path->dentry->d_inode);
+	}
+
+	if (!inode) {
+		if (likely(!IS_MNT_UNION(path->mnt)))
+			goto enoent;
+
+		err = union_lookup_point(nd, path, false);
+		if (err < 0)
+			goto out_path_put;
+		if (should_follow_link(path->dentry, follow))
+			return 1;
+		inode = path->dentry->d_inode;
+		if (!inode && nd->flags & LOOKUP_PARENT)
+			goto enoent;
+		goto success;
+	}
 
 	if (should_follow_link(path->dentry, follow)) {
 		if (nd->flags & LOOKUP_RCU) {
@@ -1539,13 +1997,15 @@
 				goto out_err;
 			}
 		}
-		BUG_ON(inode != path->dentry->d_inode);
 		return 1;
 	}
+success:
 	path_to_nameidata(path, nd);
 	nd->inode = inode;
 	return 0;
 
+enoent:
+	err = -ENOENT;
 out_path_put:
 	path_to_nameidata(path, nd);
 out_err:
@@ -1925,6 +2385,9 @@
 
 	current->total_link_count = 0;
 	err = link_path_walk(name, nd);
+	if (!err && IS_PATH_UNIONED(&nd->path))
+		printk("UNION: link_path_walk returned nd->path.mnt UPPER%s\n>>>\n",
+		       flags & LOOKUP_PARENT ? " PARENT" : "");
 
 	/* At this point we've processed all the non-terminal parts of the path
 	 * and are ready to tackle the final section.  The final section may
@@ -1936,16 +2399,33 @@
 		while (err > 0) {
 			void *cookie;
 			struct path link = terminal_symlink;
+			if (IS_PATH_UNIONED(&nd->path))
+				printk("UNION: path_lookupat: may_follow_link\n");
 			err = may_follow_link(&link, nd);
 			if (unlikely(err))
 				break;
 			nd->flags |= LOOKUP_PARENT;
+			if (IS_PATH_UNIONED(&nd->path))
+				printk("UNION: path_lookupat: follow_link\n");
 			err = follow_link(&link, nd, &cookie);
 			if (err)
 				break;
+			if (IS_PATH_UNIONED(&nd->path))
+				printk("UNION: path_lookupat: lookup_last\n");
 			err = lookup_last(nd, &terminal_symlink);
+			if (IS_PATH_UNIONED(&nd->path))
+				printk("UNION: path_lookupat: put_link\n");
 			put_link(nd, &link, cookie);
 		}
+
+		if (!err) {
+			if (!nd)
+				printk("UNION: path_lookupat: !nd\n");
+			else if (!nd->path.mnt)
+				printk("UNION: path_lookupat: !nd->path.mnt\n");
+			else if (IS_PATH_UNIONED(&nd->path))
+				printk("UNION: path_lookupat: nd->path.mnt UPPER\n");
+		}
 	}
 
 	if (!err)
@@ -2049,22 +2529,33 @@
 
 /*
  * Restricted form of lookup. Doesn't follow links, single-component only,
- * needs parent already locked. Doesn't follow mounts.
- * SMP-safe.
+ * needs parent already locked. Doesn't follow mounts.  Does annotate the
+ * dentry for unionmount.  SMP-safe.
  */
 static int lookup_hash(struct nameidata *nd, struct path *path)
 {
 	struct dentry *result;
+	int ret;
 
 	result = __lookup_hash(&nd->last, nd->path.dentry, nd->flags);
 	if (IS_ERR(result)) {
-		path->mnt = NULL;
-		path->dentry = NULL;
-		return PTR_ERR(result);
+		ret = PTR_ERR(result);
+		goto error;
 	}
+
 	path->mnt = nd->path.mnt;
 	path->dentry = result;
+	ret = union_lookup_point_locked(&nd->path, path);
+	if (ret)
+		goto error_dput;
 	return 0;
+
+error_dput:
+	dput(path->dentry);
+error:
+	path->mnt = NULL;
+	path->dentry = NULL;
+	return ret;
 }
 
 /**
@@ -2401,12 +2892,15 @@
  */
 static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
 {
-	struct inode *inode = victim->d_inode;
+	struct inode *inode = d_inode_or_lower(victim);
 	int error;
 
 	if (d_is_negative(victim))
 		return -ENOENT;
-	BUG_ON(!inode);
+	if (!inode) {
+		pr_err("### DENTRY %pd {%x}\n", victim, victim->d_flags);
+		BUG();
+	}
 
 	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
 
@@ -2513,10 +3007,8 @@
 	return error;
 }
 
-static int may_open(struct path *path, int acc_mode, int flag)
+static int may_open(struct path *path, struct inode *inode, int acc_mode, int flag)
 {
-	struct dentry *dentry = path->dentry;
-	struct inode *inode = dentry->d_inode;
 	int error;
 
 	/* O_PATH? */
@@ -2537,7 +3029,7 @@
 	case S_IFCHR:
 		if (path->mnt->mnt_flags & MNT_NODEV)
 			return -EACCES;
-		/*FALLTHRU*/
+		/* fallthrough */
 	case S_IFIFO:
 	case S_IFSOCK:
 		flag &= ~O_TRUNC;
@@ -2637,6 +3129,10 @@
 
 	BUG_ON(dentry->d_inode);
 
+	if (IS_PATH_UNIONED(&nd->path))
+		printk("UNION: --> atomic_open(%*.*s)\n",
+		       nd->last.len, nd->last.len, nd->last.name);
+
 	/* Don't create child dentry for a dead directory. */
 	if (unlikely(IS_DEADDIR(dir))) {
 		error = -ENOENT;
@@ -2738,7 +3234,7 @@
 		fsnotify_create(dir, dentry);
 		acc_mode = MAY_OPEN;
 	}
-	error = may_open(&file->f_path, acc_mode, open_flag);
+	error = may_open(&file->f_path, file->f_inode, acc_mode, open_flag);
 	if (error)
 		fput(file);
 
@@ -2791,6 +3287,10 @@
  *
  * FILE_CREATE will be set in @*opened if the dentry was created and will be
  * cleared otherwise prior to returning.
+ *
+ * If an entry on a union mount is being considered, we pass back a file from
+ * the lower layer if there is one and leave it up to do_last() to copy up if
+ * need be.
  */
 static int lookup_open(struct nameidata *nd, struct path *path,
 			struct file *file,
@@ -2803,24 +3303,43 @@
 	int error;
 	bool need_lookup;
 
+	if (IS_PATH_UNIONED(&nd->path))
+		printk("UNION: --> lookup_open(%*.*s)\n",
+		       nd->last.len, nd->last.len, nd->last.name);
+
 	*opened &= ~FILE_CREATED;
 	dentry = lookup_dcache(&nd->last, dir, nd->flags, &need_lookup);
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 
+	if (IS_PATH_UNIONED(&nd->path))
+		printk("UNION: lookup_dcache() = %p [%p]\n",
+		       dentry, dentry ? dentry->d_inode : NULL);
+
 	/* Cached positive dentry: will open in f_op->open */
 	if (!need_lookup && dentry->d_inode)
 		goto out_no_open;
 
-	if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) {
+	/* Perform an atomic open if that is available - but not if a file on
+	 * the upper filesystem of a union is being opened for writing
+	 */
+	if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open &&
+	    !(IS_MNT_UNION(nd->path.mnt) &&
+	      op->acc_mode & (MAY_WRITE | MAY_APPEND))) {
+		if (IS_PATH_UNIONED(&nd->path))
+			printk("UNION: open atomic\n");
 		return atomic_open(nd, dentry, path, file, op, got_write,
 				   need_lookup, opened);
 	}
 
+	if (IS_PATH_UNIONED(&nd->path))
+		printk("UNION: don't open atomic\n");
 	if (need_lookup) {
 		BUG_ON(dentry->d_inode);
 
 		dentry = lookup_real(dir_inode, dentry, nd->flags);
+		if (IS_PATH_UNIONED(&nd->path))
+			printk("UNION: lookup_real() = %p\n", dentry);
 		if (IS_ERR(dentry))
 			return PTR_ERR(dentry);
 	}
@@ -2830,8 +3349,8 @@
 		umode_t mode = op->mode;
 		if (!IS_POSIXACL(dir->d_inode))
 			mode &= ~current_umask();
-		/*
-		 * This write is needed to ensure that a
+
+		/* This write is needed to ensure that a
 		 * rw->ro transition does not occur between
 		 * the time when the file is created and when
 		 * a permanent write count is taken through
@@ -2841,16 +3360,81 @@
 			error = -EROFS;
 			goto out_dput;
 		}
+
+		/* If the negative dentry is on the upper layer of a union
+		 * mount then we may need to copy up or turn a whiteout into a
+		 * file.  The negative dentry will not be on a lower layer at
+		 * this point.
+		 *
+		 * If the dentry is a whiteout or a normal negative dentry in
+		 * an opaque directory then we can just create over it.
+		 *
+		 * If O_CREAT|O_TRUNC|O_EXCL is specified then we fail if
+		 * there's a file in the lower layer or succeed without copying
+		 * up otherwise.
+		 *
+		 * If O_CREAT|O_TRUNC is specified then we need to copy up the
+		 * attributes if there's a lower file.
+		 *
+		 * If O_CREAT|O_RDONLY is specified and the file exists in the
+		 * lower layer, we just use the lower file.
+		 *
+		 * Otherwise we need to copy up the whole file.
+		 */
+		if (IS_PATH_UNIONED(&nd->path)) {
+			struct path tmp = {
+				.mnt = nd->path.mnt,
+				.dentry = dentry,
+			};
+
+			printk("UNION: deal with O_CREAT\n");
+
+			error = union_lookup_point_locked(&nd->path, &tmp);
+			if (error == -ENOENT)
+				goto just_create;
+			if (error < 0)
+				goto out_dput;
+			if (d_is_directory(dentry)) {
+				error = -EISDIR;
+				if (op->open_flag & O_EXCL)
+					error = -EEXIST;
+				goto out_dput;
+			}
+
+			if (d_is_symlink(dentry))
+				goto out_no_open;
+			if (d_is_negative(dentry)) {
+				printk("UNION: lower blocked\n");
+				goto just_create; /* Lower is blocked off */
+			}
+
+			printk("UNION: deal with O_CREAT\n");
+			if (d_is_pinning_lower(dentry)) {
+				BUG_ON(!d_get_fallthru(dentry)->d_inode);
+				printk("UNION: lower available (O_CREAT ignored)\n");
+				goto out_no_open;
+			}
+
+			printk("UNION: create over lower\n");
+		}
+
+	just_create:
 		*opened |= FILE_CREATED;
 		error = security_path_mknod(&nd->path, dentry, mode, 0);
 		if (error)
 			goto out_dput;
 		error = vfs_create(dir->d_inode, dentry, mode,
 				   nd->flags & LOOKUP_EXCL);
+		if (IS_PATH_UNIONED(&nd->path))
+			printk("UNION: vfs_create() = %d [%pd: %p]\n",
+			       error, dentry, dentry);
 		if (error)
 			goto out_dput;
 	}
+
 out_no_open:
+	if (IS_PATH_UNIONED(&nd->path))
+		printk("UNION: out_no_open\n");
 	path->dentry = dentry;
 	path->mnt = nd->path.mnt;
 	return 1;
@@ -2878,6 +3462,9 @@
 	bool retried = false;
 	int error;
 
+	if (IS_PATH_UNIONED(&nd->path))
+		printk("UNION: --> do_last()\n");
+
 	nd->flags &= ~LOOKUP_PARENT;
 	nd->flags |= op->intent;
 
@@ -2895,8 +3482,11 @@
 			symlink_ok = true;
 		/* we _can_ be in RCU mode here */
 		error = lookup_fast(nd, path, &inode);
-		if (likely(!error))
+		if (likely(!error)) {
+			if (IS_PATH_UNIONED(&nd->path))
+				printk("UNION: --> do_last: goto finish_lookup\n");
 			goto finish_lookup;
+		}
 
 		if (error < 0)
 			goto out;
@@ -2923,18 +3513,24 @@
 retry_lookup:
 	if (op->open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
 		error = mnt_want_write(nd->path.mnt);
-		if (!error)
+		if (!error) {
+			if (IS_PATH_UNIONED(&nd->path))
+				printk("UNION: got_write = true\n");
 			got_write = true;
+		}
 		/*
 		 * do _not_ fail yet - we might not need that or fail with
 		 * a different error; let lookup_open() decide; we'll be
 		 * dropping this one anyway.
 		 */
 	}
+
 	mutex_lock(&dir->d_inode->i_mutex);
 	error = lookup_open(nd, path, file, op, got_write, opened);
 	mutex_unlock(&dir->d_inode->i_mutex);
 
+	if (IS_PATH_UNIONED(&nd->path))
+		printk("UNION: lookup_open() = %d\n", error);
 	if (error <= 0) {
 		if (error)
 			goto out;
@@ -2947,12 +3543,16 @@
 		goto opened;
 	}
 
+	/* At this point, the file may have been looked up and created or
+	 * truncated but hasn't been opened yet - however, since we dropped the
+	 * lock, things may have changed in the filesystem.
+	 */
 	if (*opened & FILE_CREATED) {
 		/* Don't check for write permission, don't truncate */
 		open_flag &= ~O_TRUNC;
 		will_truncate = false;
 		acc_mode = MAY_OPEN;
-		inode = path->dentry->d_inode;
+		inode = d_inode_or_lower(path->dentry);
 		path_to_nameidata(path, nd);
 		goto finish_open_created;
 	}
@@ -2962,7 +3562,7 @@
 	 */
 	if (d_is_positive(path->dentry)) {
 		audit_inode(name, path->dentry, 0);
-		inode = path->dentry->d_inode;
+		inode = d_inode_or_lower(path->dentry);
 	}
 
 	/*
@@ -2976,40 +3576,138 @@
 	}
 
 	error = -EEXIST;
-	if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))
+	if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) {
+		if (IS_PATH_UNIONED(&nd->path))
+			printk("UNION: created, but O_EXCL\n");
 		goto exit_dput;
+	}
+
+	if (IS_PATH_UNIONED(path) &&
+	    d_is_pinning_lower(path->dentry) &&
+	    d_managed(d_get_fallthru(path->dentry))) {
+		error = -EREMOTE;
+		goto exit_dput;
+	}
 
 	error = follow_managed(path, nd->flags);
-	if (error < 0)
+	if (error < 0) {
+		if (IS_PATH_UNIONED(&nd->path))
+			printk("UNION: follow_managed() = %d\n", error);
 		goto exit_dput;
+	}
 
 	if (error)
 		nd->flags |= LOOKUP_JUMPED;
 
 	BUG_ON(nd->flags & LOOKUP_RCU);
-	inode = path->dentry->d_inode;
+	inode = d_inode_or_lower(path->dentry);
 finish_lookup:
+	if (IS_MNT_UNION(path->mnt))
+		printk("UNION: do_last: finish_lookup: at upper\n");
+	if (IS_PATH_UNIONED(&nd->path))
+		printk("UNION: do_last: nd->path.mnt at upper\n");
 	/* we _can_ be in RCU mode here */
-	error = -ENOENT;
 	if (d_is_negative(path->dentry)) {
-		path_to_nameidata(path, nd);
-		goto out;
+		if (likely(!IS_PATH_UNIONED(&nd->path)))
+			goto noent;
+
+		printk("UNION: %pd: d_flags = %x\n",
+		       path->dentry, path->dentry->d_flags);
+
+		error = union_lookup_point(nd, path, got_write);
+		if (error)
+			goto exit_dput;
+
+		if (d_is_negative(path->dentry))
+			goto noent;
+
+		inode = d_inode_or_lower(path->dentry);
+		if (!inode)
+			goto noent;
+
+		printk("UNION: got lower from d_fallthru\n");
 	}
 
 	if (should_follow_link(path->dentry, !symlink_ok)) {
+		/* The dentry is either a symlink on this fs or it's a
+		 * fallthrough to a symlink in a lower fs (in which case inode
+		 * will be NULL).
+		 */
+		if (IS_PATH_UNIONED(&nd->path))
+			printk("UNION: should_follow_link() -> true\n");
 		if (nd->flags & LOOKUP_RCU) {
 			if (unlikely(unlazy_walk(nd, path->dentry))) {
 				error = -ECHILD;
 				goto out;
 			}
 		}
-		BUG_ON(inode != path->dentry->d_inode);
+		if (got_write)
+			mnt_drop_write(nd->path.mnt);
 		return 1;
 	}
 
+	if (IS_PATH_UNIONED(&nd->path) &&
+	    !path->dentry->d_inode &&
+	    (op->acc_mode & (MAY_WRITE | MAY_APPEND) ||
+	     op->open_flag & O_TRUNC) &&
+	    S_ISREG(inode->i_mode)) {
+		printk("UNION: WWWWW Need to copy up\n");
+
+		if (nd->flags & LOOKUP_RCU &&
+		    unlikely(unlazy_walk(nd, path->dentry))) {
+			path_to_nameidata(path, nd);
+			error = -ECHILD;
+			goto out;
+		}
+
+		if (op->open_flag & O_DIRECTORY) {
+			error = -ENOTDIR;
+			goto exit_dput;
+		}
+
+		/* Like inode_permission(), but inode->i_sb != dentry->d_sb */
+		error = sb_permission(path->dentry->d_sb, inode, MAY_WRITE);
+		if (error < 0)
+			goto exit_dput;
+		error = __inode_permission(inode, MAY_WRITE);
+		if (error < 0)
+			goto exit_dput;
+
+		error = mnt_want_write(nd->path.mnt);
+		if (error)
+			goto exit_dput;
+
+		error = union_copy_up_for_do_last(&nd->path, path, will_truncate);
+		mnt_drop_write(nd->path.mnt);
+		if (error)
+			goto exit_dput;
+
+		if (path->mnt != nd->path.mnt)
+			printk("UNION: !!! mnt not changed by copyup\n");
+
+		printk("UNION: copied up lower\n");
+		BUG_ON(path->mnt != nd->path.mnt);
+
+		inode = path->dentry->d_inode;
+		if (!inode)
+			goto noent;
+
+		open_flag &= ~O_TRUNC;
+		will_truncate = false;
+		acc_mode = MAY_OPEN;
+	}
+
 	if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path->mnt) {
+		if (IS_PATH_UNIONED(&nd->path)) {
+			if (nd->flags & LOOKUP_RCU)
+				printk("UNION: in rcu mode\n");
+			if (nd->path.mnt != path->mnt)
+				printk("UNION: nd->path.mnt != path->mnt\n");
+		}
 		path_to_nameidata(path, nd);
 	} else {
+		if (IS_PATH_UNIONED(&nd->path))
+			printk("UNION: not in rcu mode\n");
 		save_parent.dentry = nd->path.dentry;
 		save_parent.mnt = mntget(path->mnt);
 		nd->path.dentry = path->dentry;
@@ -3018,7 +3716,11 @@
 	nd->inode = inode;
 	/* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
 finish_open:
+	if (IS_PATH_UNIONED(&nd->path))
+		printk("UNION: --> complete_walk()\n");
 	error = complete_walk(nd);
+	if (IS_PATH_UNIONED(&nd->path))
+		printk("UNION: <-- complete_walk() = %d\n", error);
 	if (error) {
 		path_put(&save_parent);
 		return error;
@@ -3030,23 +3732,30 @@
 	    (d_is_directory(nd->path.dentry) || d_is_autodir(nd->path.dentry)))
 		goto out;
 	error = -ENOTDIR;
-	if ((nd->flags & LOOKUP_DIRECTORY) && !d_is_directory(nd->path.dentry))
+	if ((nd->flags & LOOKUP_DIRECTORY) && !d_is_directory(nd->path.dentry)) {
+		if (IS_PATH_UNIONED(&nd->path))
+			printk("UNION: !can_lookup\n");
 		goto out;
+	}
 	if (!S_ISREG(inode->i_mode))
 		will_truncate = false;
 
-	if (will_truncate) {
+	if (will_truncate && !got_write) {
 		error = mnt_want_write(nd->path.mnt);
 		if (error)
 			goto out;
 		got_write = true;
 	}
 finish_open_created:
-	error = may_open(&nd->path, acc_mode, open_flag);
+	error = may_open(&nd->path, inode, acc_mode, open_flag);
+	if (IS_PATH_UNIONED(&nd->path))
+		printk("UNION: <-- may_open() = %d\n", error);
 	if (error)
 		goto out;
 	file->f_path.mnt = nd->path.mnt;
 	error = finish_open(file, nd->path.dentry, inode, NULL, opened);
+	if (IS_PATH_UNIONED(&nd->path))
+		printk("UNION: <-- finish_open() = %d\n", error);
 	if (error) {
 		if (error == -EOPENSTALE)
 			goto stale_open;
@@ -3072,6 +3781,13 @@
 	terminate_walk(nd);
 	return error;
 
+noent:
+	if (IS_PATH_UNIONED(&nd->path))
+		printk("UNION: %pd: ENOENT\n", path->dentry);
+	path_to_nameidata(path, nd);
+	error = -ENOENT;
+	goto out;
+
 exit_dput:
 	path_put_conditional(path, nd);
 	goto out;
@@ -3136,7 +3852,7 @@
 	if (error)
 		goto out2;
 	audit_inode(pathname, nd->path.dentry, 0);
-	error = may_open(&nd->path, op->acc_mode, op->open_flag);
+	error = may_open(&nd->path, nd->path.dentry->d_inode, op->acc_mode, op->open_flag);
 	if (error)
 		goto out2;
 	file->f_path.mnt = nd->path.mnt;
@@ -3327,6 +4043,7 @@
 	}
 	*path = nd.path;
 	return new_path.dentry;
+
 fail:
 	dput(new_path.dentry);
 unlock:
@@ -3472,8 +4189,17 @@
 		return -EMLINK;
 
 	error = dir->i_op->mkdir(dir, dentry, mode);
-	if (!error)
-		fsnotify_mkdir(dir, dentry);
+	if (error)
+		return error;
+
+	/* XXX racy - crash now and dir isn't opaque */
+	if (IS_DIR_UNIONED(dentry->d_parent)) {
+		dentry->d_inode->i_flags |= S_OPAQUE;
+		mark_inode_dirty(dentry->d_inode);
+	}
+
+	fsnotify_mkdir(dir, dentry);
+
 	return error;
 }
 
@@ -3507,6 +4233,145 @@
 	return sys_mkdirat(AT_FDCWD, pathname, mode);
 }
 
+/**
+ * vfs_whiteout - Create a whiteout for the given directory entry
+ * @parent: Parent directory
+ * @old_path: Directory entry to whiteout
+ * @isdir: The file at @old_path is a directory
+ *
+ * Create a whiteout for the given directory entry.  A whiteout prevents lookup
+ * from dropping down to a lower layer of a union mounted file system.
+ *
+ * There are two important cases: (a) The directory entry to be whited out may
+ * already exist, in which case it must first be deleted before we create the
+ * whiteout, and (b) no such directory entry exists and we only have to create
+ * the whiteout itself.
+ *
+ * The caller must pass in a dentry for the directory entry to be whited out -
+ * a positive one if it exists, and a negative if not.  When this function
+ * returns, the caller should dput() the old, now defunct dentry it passed in.
+ * The dentry for the whiteout itself is created inside this function.
+ *
+ * The caller must hold the parent's i_mutex.  Returns 0 or a negative errno.
+ */
+static int vfs_whiteout(struct dentry *parent, struct path *old_path, int isdir)
+{
+	struct dentry *old_dentry = old_path->dentry;
+	struct inode *dir = parent->d_inode, *old_inode = old_dentry->d_inode;
+	int err = 0;
+
+	BUG_ON(old_dentry->d_parent != parent);
+
+	if (!dir->i_op || !dir->i_op->whiteout)
+		return -EOPNOTSUPP;
+
+	/* If the old dentry is positive, then we have to delete this entry
+	 * before we create the whiteout.  The file system ->whiteout() op does
+	 * the actual delete, but we do all the VFS-level checks and changes
+	 * here.
+	 */
+	if (old_inode) {
+		mutex_lock(&old_inode->i_mutex);
+		if (d_mountpoint(old_dentry)) {
+			err = -EBUSY;
+			goto error_unlock;
+		}
+		if (isdir)
+			err = security_inode_rmdir(dir, old_dentry);
+		else
+			err = security_inode_unlink(dir, old_dentry);
+		if (err)
+			goto error_unlock;
+
+		/* To tell whether a directory is empty we must subtract the
+		 * whiteouts in the top dir from the union of the dirents in
+		 * the lower dirs; reading the lower dirs alone cannot tell us
+		 * that if the directory has not yet been copied up.  So copy
+		 * the directory up first.
+		 */
+		if (isdir) {
+			err = union_copy_up_dir(old_path);
+			if (err)
+				goto error_unlock;
+		}
+	}
+
+	err = dir->i_op->whiteout(dir, old_dentry);
+	if (err)
+		goto error_unlock;
+
+	if (old_inode) {
+		if (isdir) {
+			/* Mark it dead under i_mutex, like vfs_rmdir() */
+			old_inode->i_flags |= S_DEAD;
+			dont_mount(old_dentry);
+		}
+		mutex_unlock(&old_inode->i_mutex);
+		if (!isdir)
+			fsnotify_link_count(old_inode);
+		d_drop(old_dentry);
+	}
+	return err;
+
+error_unlock:
+	if (old_inode)
+		mutex_unlock(&old_inode->i_mutex);
+	return err;
+}
+
+/*
+ * White out @path in nd's directory, provided may_delete() allows it.
+ */
+static int do_whiteout(struct nameidata *nd, struct path *path, int isdir)
+{
+	struct path pinned = nd->path;
+	struct dentry *parent = nd->path.dentry;
+	int ret;
+
+	path_get(&pinned);
+
+	ret = may_delete(parent->d_inode, path->dentry, isdir);
+	if (ret == 0)
+		ret = vfs_whiteout(parent, path, isdir);
+
+	path_put(&pinned);
+	return ret;
+}
+
+/*
+ * Finish off a rename out of a unionmounted directory by leaving a whiteout
+ * behind, so that a file of the same name in the lowerfs cannot show through.
+ */
+static int vfs_whiteout_after_rename(struct dentry *parent,
+				     const struct qstr *name)
+{
+	struct inode *parent_inode = parent->d_inode;
+	struct dentry *wh;
+	int ret;
+
+	if (!parent_inode->i_op || !parent_inode->i_op->whiteout)
+		return -EOPNOTSUPP;
+
+	/* The rename has just moved the old dentry somewhere else and the
+	 * caller's locks stop anything taking its place, so there's no need
+	 * to call lookup - especially as the ->whiteout() op is expected to
+	 * add the new dentry into the tree.
+	 */
+	wh = d_alloc(parent, name);
+	if (!wh)
+		return -ENOMEM;
+
+	/* I think it's okay to hand the new dentry to ->whiteout() in place of
+	 * the old one: it seems to want only the name, the parent dentry and
+	 * the inode - and we know the inode no longer resides here, so d_inode
+	 * being NULL is right.
+	 */
+	ret = parent_inode->i_op->whiteout(parent_inode, wh);
+
+	dput(wh);
+	return ret;
+}
+
 /*
  * The dentry_unhash() helper will try to drop the dentry early: we
  * should have a usage count of 1 if we're the only user of this
@@ -3601,14 +4466,13 @@
 	error = lookup_hash(&nd, &path);
 	if (error)
 		goto exit2;
-	if (!path.dentry->d_inode) {
-		error = -ENOENT;
-		goto exit3;
-	}
 	error = security_path_rmdir(&nd.path, path.dentry);
 	if (error)
 		goto exit3;
-	error = vfs_rmdir(nd.path.dentry->d_inode, path.dentry);
+	if (IS_DIR_UNIONED(nd.path.dentry))
+		error = do_whiteout(&nd, &path, 1);
+	else
+		error = vfs_rmdir(nd.path.dentry->d_inode, path.dentry);
 exit3:
 	path_put_conditional(&path, &nd);
 exit2:
@@ -3699,6 +4563,7 @@
 	struct inode *inode = NULL;
 	struct inode *delegated_inode = NULL;
 	unsigned int lookup_flags = 0;
+
 retry:
 	name = user_path_parent(dfd, pathname, &nd, lookup_flags);
 	if (IS_ERR(name))
@@ -3715,22 +4580,47 @@
 retry_deleg:
 	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
 	error = lookup_hash(&nd, &path);
-	if (!error) {
-		/* Why not before? Because we want correct error value */
-		if (nd.last.name[nd.last.len])
-			goto slashes;
+	if (error)
+		goto exit2;
+
+	/* Why not before? Because we want correct error value */
+	if (d_is_negative(path.dentry)) {
+		if (IS_PATH_UNIONED(&nd.path))
+			printk("UNION: unlink neg\n");
+		BUG_ON(path.dentry->d_inode);
+		error = -ENOENT;
+	} else if (nd.last.name[nd.last.len]) {
+		if (IS_PATH_UNIONED(&nd.path))
+			printk("UNION: unlink slash: %x\n", path.dentry->d_flags);
+		BUG_ON(!(path.dentry->d_flags & DCACHE_UNION_PINNING_LOWER) &&
+		       !path.dentry->d_inode);
+		error = d_is_directory(path.dentry) ? -EISDIR : -ENOTDIR;
+	} else if (!path.dentry->d_inode) {
+		if (IS_PATH_UNIONED(&nd.path))
+			printk("UNION: unlink lower\n");
+		error = security_path_unlink(&nd.path, path.dentry);
+		if (!error) {
+			if (IS_PATH_UNIONED(&nd.path))
+				printk("UNION: call do_whiteout()\n");
+			error = do_whiteout(&nd, &path, 0);
+		}
+	} else {
+		if (IS_PATH_UNIONED(&nd.path))
+			printk("UNION: unlink upper\n");
 		inode = path.dentry->d_inode;
-		if (d_is_negative(path.dentry))
-			goto slashes;
 		ihold(inode);
 		error = security_path_unlink(&nd.path, path.dentry);
-		if (error)
-			goto exit2;
-		error = vfs_unlink(nd.path.dentry->d_inode, path.dentry,
-				   &delegated_inode);
-exit2:
-		path_put_conditional(&path, &nd);
+		if (!error) {
+			if (IS_DIR_UNIONED(nd.path.dentry))
+				error = do_whiteout(&nd, &path, 0);
+			else
+				error = vfs_unlink(nd.path.dentry->d_inode, path.dentry,
+						   &delegated_inode);
+		}
 	}
+
+	path_put_conditional(&path, &nd);
+exit2:
 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
 	if (inode)
 		iput(inode);	/* truncate the inode here */
@@ -3750,15 +4640,6 @@
 		goto retry;
 	}
 	return error;
-
-slashes:
-	if (d_is_negative(path.dentry))
-		error = -ENOENT;
-	else if (d_is_directory(path.dentry) || d_is_autodir(path.dentry))
-		error = -EISDIR;
-	else
-		error = -ENOTDIR;
-	goto exit2;
 }
 
 SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
@@ -3918,8 +4799,8 @@
 		int, newdfd, const char __user *, newname, int, flags)
 {
 	struct dentry *new_dentry;
-	struct path old_path, new_path;
-	struct inode *delegated_inode = NULL;
+	struct path old_path, new_path, lower_cache, actual;
+	struct inode *inode, *delegated_inode = NULL;
 	int how = 0;
 	int error;
 
@@ -3938,11 +4819,22 @@
 
 	if (flags & AT_SYMLINK_FOLLOW)
 		how |= LOOKUP_FOLLOW;
+
 retry:
 	error = user_path_at(olddfd, oldname, how, &old_path);
 	if (error)
 		return error;
 
+	inode = union_get_inode(&old_path, &lower_cache, &actual);
+	if (IS_ERR(inode)) {
+		error = PTR_ERR(inode);
+		goto out;
+	}
+	error = union_copy_up(&old_path, &actual);
+	path_put_maybe(&lower_cache);
+	if (error < 0)
+		goto out;
+
 	new_dentry = user_path_create(newdfd, newname, &new_path,
 					(how & LOOKUP_REVAL));
 	error = PTR_ERR(new_dentry);
@@ -4168,7 +5060,8 @@
 		int, newdfd, const char __user *, newname)
 {
 	struct dentry *old_dir, *new_dir;
-	struct path old, new;
+	struct inode *old_inode;
+	struct path old, new, old_lower_cache, old_actual;
 	struct dentry *trap;
 	struct nameidata oldnd, newnd;
 	struct inode *delegated_inode = NULL;
@@ -4193,7 +5086,6 @@
 	error = -EXDEV;
 	if (oldnd.path.mnt != newnd.path.mnt)
 		goto exit2;
-
 	old_dir = oldnd.path.dentry;
 	error = -EBUSY;
 	if (oldnd.last_type != LAST_NORM)
@@ -4218,6 +5110,7 @@
 	if (error)
 		goto exit3;
 	/* source must exist */
+	old_inode = d_inode_or_lower(old.dentry);
 	error = -ENOENT;
 	if (d_is_negative(old.dentry))
 		goto exit4;
@@ -4233,6 +5126,11 @@
 	error = -EINVAL;
 	if (old.dentry == trap)
 		goto exit4;
+	error = -EXDEV;
+	/* Can't rename a directory from a lower layer */
+	if (IS_DIR_UNIONED(oldnd.path.dentry) &&
+	    IS_DIR_UNIONED(old.dentry))
+		goto exit4;
 	error = lookup_hash(&newnd, &new);
 	if (error)
 		goto exit4;
@@ -4240,17 +5138,44 @@
 	error = -ENOTEMPTY;
 	if (new.dentry == trap)
 		goto exit5;
+	error = -EXDEV;
+	/* Can't rename over directories on the lower layer */
+	if (IS_DIR_UNIONED(newnd.path.dentry) &&
+	    IS_DIR_UNIONED(new.dentry))
+		goto exit5;
 
 	error = security_path_rename(&oldnd.path, old.dentry,
-				     &newnd.path, new.dentry);
+				     &newnd.path, new.dentry,
+				     old_inode);
 	if (error)
 		goto exit5;
+
+	error = union_copy_up_locked(&oldnd.path, &old, &old_actual);
+	if (error)
+		goto exit5;
+
 	error = vfs_rename(old_dir->d_inode, old.dentry,
 				   new_dir->d_inode, new.dentry,
 				   &delegated_inode);
+	if (error)
+		goto exit5;
+
+	/* Now whiteout the source.  We may have exposed a positive lower level
+	 * dentry, so we have to make sure it doesn't get resurrected.  We
+	 * could probe the lower levels at this point to find out whether there
+	 * is actually anything that needs whiting out.
+	 *
+	 * Note that if this fails, it may leave the lower dentry exposed, and
+	 * we may not be able to recover by simply renaming back (say we
+	 * encountered ENOMEM or ENOSPC conditions).
+	 */
+	if (IS_DIR_UNIONED(oldnd.path.dentry))
+		error = vfs_whiteout_after_rename(old_dir, &oldnd.last);
+
 exit5:
 	path_put_conditional(&new, &newnd);
 exit4:
+	path_put_maybe(&old_lower_cache);
 	path_put_conditional(&old, &oldnd);
 exit3:
 	unlock_rename(new_dir, old_dir);
@@ -4311,6 +5236,7 @@
 	int res;
 
 	nd.depth = 0;
+	dentry = d_dentry_or_lower(dentry);
 	cookie = dentry->d_inode->i_op->follow_link(dentry, &nd);
 	if (IS_ERR(cookie))
 		return PTR_ERR(cookie);
diff --git a/fs/namespace.c b/fs/namespace.c
index 6a147ab..e1a6a16 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -25,6 +25,7 @@
 #include <linux/magic.h>
 #include "pnode.h"
 #include "internal.h"
+#include "union.h"
 
 #define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head))
 #define HASH_SIZE (1UL << HASH_SHIFT)
@@ -1403,10 +1404,9 @@
 
 #endif
 
-static bool is_mnt_ns_file(struct dentry *dentry)
+static bool is_mnt_ns_file(struct inode *inode)
 {
 	/* Is this a proxy for a mount namespace? */
-	struct inode *inode = dentry->d_inode;
 	struct proc_ns *ei;
 
 	if (!proc_ns_inode(inode))
@@ -1419,16 +1419,16 @@
 	return true;
 }
 
-static bool mnt_ns_loop(struct dentry *dentry)
+static bool mnt_ns_loop(struct inode *inode)
 {
 	/* Could bind mounting the mount namespace inode cause a
 	 * mount namespace loop?
 	 */
 	struct mnt_namespace *mnt_ns;
-	if (!is_mnt_ns_file(dentry))
+	if (!is_mnt_ns_file(inode))
 		return false;
 
-	mnt_ns = get_proc_ns(dentry->d_inode)->ns;
+	mnt_ns = get_proc_ns(inode)->ns;
 	return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
 }
 
@@ -1440,7 +1440,7 @@
 	if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt))
 		return ERR_PTR(-EINVAL);
 
-	if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
+	if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry->d_inode))
 		return ERR_PTR(-EINVAL);
 
 	res = q = clone_mnt(mnt, dentry, flag);
@@ -1463,7 +1463,7 @@
 				continue;
 			}
 			if (!(flag & CL_COPY_MNT_NS_FILE) &&
-			    is_mnt_ns_file(s->mnt.mnt_root)) {
+			    is_mnt_ns_file(s->mnt.mnt_root->d_inode)) {
 				s = skip_mnt_tree(s);
 				continue;
 			}
@@ -1557,6 +1557,202 @@
 	return 0;
 }
 
+/**
+ * check_topmost_union_mnt - mount-time checks for union mount
+ * @topmost_mnt: vfsmount of the topmost union file system
+ * @mnt_flags: mount flags for the topmost mount
+ *
+ * Our readdir() solution of copying up directory entries requires that the
+ * topmost layer be writeable and support whiteouts and fallthrus.  The topmost
+ * file system can't be mounted elsewhere because it's Too Hard(tm).
+ * Returns 0 if the mount is usable, or -EROFS, -EBUSY or -EINVAL if not.
+ */
+static int check_topmost_union_mnt(struct mount *topmost_mnt, int mnt_flags)
+{
+#ifndef CONFIG_UNION_MOUNT
+	printk(KERN_INFO "union mount: not supported by the kernel\n");
+	return -EINVAL;
+#else
+	struct super_block *sb = topmost_mnt->mnt.mnt_sb;
+
+	if (mnt_flags & MNT_READONLY)
+		return -EROFS;
+
+	if (atomic_read(&sb->s_active) != 1) {
+		printk(KERN_INFO "union mount: topmost fs mounted elsewhere\n");
+		return -EBUSY;
+	}
+
+	if (!(sb->s_flags & MS_WHITEOUT)) {
+		printk(KERN_INFO "union mount: whiteouts not supported by fs\n");
+		return -EINVAL;
+	}
+
+	if (!(sb->s_flags & MS_FALLTHRU)) {
+		printk(KERN_INFO "union mount: fallthrus not supported by fs\n");
+		return -EINVAL;
+	}
+
+	return 0;
+#endif
+}
+
+void put_union_sb(struct super_block *sb)
+{
+	if (likely(!sb->s_union_lower_mnts))
+		return;	/* no cloned lower mounts attached to this sb */
+	drop_collected_mounts(sb->s_union_lower_mnts);
+	sb->s_union_lower_mnts = NULL;
+	sb->s_union_count = 0;
+}
+
+/**
+ * clone_union_tree - Clone all union-able mounts at this mountpoint
+ * @topmost: vfsmount of topmost layer
+ * @mntpnt: target of union mount
+ *
+ * Clone every mount at the union mount's target mountpoint (well, pathname)
+ * that qualifies as a union lower layer and attach the clone to the topmost
+ * superblock.  The hard readonly count of each lower layer is incremented.
+ *
+ * Returns 0 on success, -EINVAL if @mntpnt is not a root dir, or the error
+ * from copy_tree() if a mount on or below this pathname can't be unioned.  So
+ * you can't union mount at the root of an existing mount without unioning it.
+ *
+ * XXX - Maybe should take # of layers to go down as an argument, but how to
+ * pass this in through mount options?  All solutions look ugly.  Currently you
+ * express your intention by mounting file systems on the same mountpoint,
+ * which is pretty elegant.
+ */
+static int clone_union_tree(struct mount *topmost, struct path *mntpnt)
+{
+	struct mount *lowest, *clone;
+	const int clone_flags = CL_COPY_ALL | CL_PRIVATE |
+				CL_NO_SHARED | CL_NO_SLAVE |
+				CL_MAKE_HARD_READONLY;
+
+	if (!IS_ROOT(mntpnt->dentry)) {
+		printk(KERN_INFO "union mount: mount point must be a root dir\n");
+		return -EINVAL;
+	}
+
+	/* Walk down to the "lowest" layer to union: keep stepping to the
+	 * parent for as long as this mount sits on its parent's root, and
+	 * stop if we reach a mount that is its own parent.
+	 */
+	lowest = real_mount(mntpnt->mnt);
+	while (lowest->mnt_parent != lowest &&
+	       lowest->mnt_parent->mnt.mnt_root == lowest->mnt_mountpoint)
+		lowest = lowest->mnt_parent;
+
+	/* Clone the read-only mounts and submounts from there down, only if
+	 * they are not shared or slave, taking a hard read-only users count
+	 * on each one.  If this can't be done for every mount and submount
+	 * below the lowest layer, fail.
+	 */
+	clone = copy_tree(lowest, lowest->mnt.mnt_root, clone_flags);
+	if (IS_ERR(clone))
+		return PTR_ERR(clone);
+
+	topmost->mnt.mnt_sb->s_union_lower_mnts = &clone->mnt;
+	return 0;
+}
+
+/**
+ * build_root_union - Create the union stack for the root dir
+ * @topmost_mnt: vfsmount of topmost mount
+ *
+ * Annoyingly, we have to traverse "up" from the root of the cloned tree to
+ * find the topmost read-only mount, and then traverse back "down" to build the
+ * stack.  Returns 0, or the union_add_dir() error after freeing the unions.
+ */
+static int build_root_union(struct mount *topmost_mnt)
+{
+	struct mount *mnt, *topmost_ro_mnt;
+	struct path lower, topmost_path;
+	unsigned int i, layers = 1;
+	int err = 0;
+
+	/* Find the topmost read-only mount, counting the layers as we go */
+	topmost_ro_mnt = real_mount(topmost_mnt->mnt.mnt_sb->s_union_lower_mnts);
+	for (mnt = topmost_ro_mnt; mnt; mnt = next_mnt(mnt, topmost_ro_mnt)) {
+		if (mnt->mnt_parent == topmost_ro_mnt &&
+		    mnt->mnt_mountpoint == topmost_ro_mnt->mnt.mnt_root) {
+			topmost_ro_mnt = mnt;
+			layers++;
+		}
+	}
+	topmost_mnt->mnt.mnt_sb->s_union_count = layers;
+
+	/* TODO: use collect_mounts() here rather than merely mntgetting */
+
+	/* Build the root dir's union stack from the top down */
+	topmost_path.mnt = &topmost_mnt->mnt;
+	topmost_path.dentry = topmost_mnt->mnt.mnt_root;
+	mnt = topmost_ro_mnt;
+	for (i = 0; i < layers; i++) {
+		lower.mnt = mntget(&mnt->mnt); /* TODO: fix (see note above) */
+		lower.dentry = dget(mnt->mnt.mnt_root);
+		err = union_add_dir(&topmost_path, &lower, i);
+		if (err)
+			goto out; /* NOTE(review): refs on lower leaked? */
+		mnt = mnt->mnt_parent;
+	}
+	return 0;
+
+out:
+	d_free_unions(topmost_path.dentry);
+	topmost_mnt->mnt.mnt_sb->s_union_count = 0;
+	return err;
+}
+
+/**
+ * prepare_mnt_union - do setup necessary for a union mount
+ * @topmost_mnt: vfsmount of topmost layer
+ * @mntpnt: path of requested mountpoint
+ *
+ * Check that the topmost mount is usable, clone the lower mounts found at
+ * @mntpnt onto the topmost superblock and build the root directory's union
+ * stack.  We union every underlying file system that is mounted on the same
+ * mountpoint (well, pathname), read-only, and not shared.
+ *
+ * Returns 0 on success, -ENOENT if @mntpnt has been unlinked, or the error
+ * from whichever step failed; the cloned lower mounts are dropped again if
+ * the root union stack can't be built.
+ *
+ * Caller needs namespace_sem, but can't have vfsmount_lock.
+ */
+static int prepare_mnt_union(struct mount *topmost_mnt, struct path *mntpnt)
+{
+	int ret;
+
+	if (d_unlinked(mntpnt->dentry))
+		return -ENOENT;
+
+	printk("UNION: prepare\n");
+
+	ret = check_topmost_union_mnt(topmost_mnt, topmost_mnt->mnt.mnt_flags);
+	if (!ret)
+		ret = clone_union_tree(topmost_mnt, mntpnt);
+	if (ret)
+		return ret;
+
+	ret = build_root_union(topmost_mnt);
+	if (ret) {
+		put_union_sb(topmost_mnt->mnt.mnt_sb);
+		return ret;
+	}
+
+	printk("UNION: prepared\n");
+	return 0;
+}
+
+static void cleanup_mnt_union(struct mount *topmost_mnt)
+{
+	d_free_unions(topmost_mnt->mnt.mnt_root); /* root's union stack */
+	put_union_sb(topmost_mnt->mnt.mnt_sb); /* cloned lower mounts */
+}
+
 /*
  *  @source_mnt : mount tree to be attached
  *  @nd         : place the mount tree @source_mnt is attached
@@ -1788,37 +1984,56 @@
 static int do_loopback(struct path *path, const char *old_name,
 				int recurse)
 {
-	struct path old_path;
+	struct path old_path, lower_cache, actual;
 	struct mount *mnt = NULL, *old, *parent;
 	struct mountpoint *mp;
+	struct inode *inode;
 	int err;
 	if (!old_name || !*old_name)
 		return -EINVAL;
-	err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
+
+	err = user_path_at(AT_FDCWD, old_name,
+			   LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT, &old_path);
 	if (err)
 		return err;
 
+	inode = union_get_inode(&old_path, &lower_cache, &actual);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out;
+	}
+
 	err = -EINVAL;
-	if (mnt_ns_loop(old_path.dentry))
-		goto out; 
+	if (mnt_ns_loop(inode))
+		goto out_lower; 
 
 	mp = lock_mount(path);
 	err = PTR_ERR(mp);
 	if (IS_ERR(mp))
-		goto out;
+		goto out_lower;
 
 	old = real_mount(old_path.mnt);
-	parent = real_mount(path->mnt);
 
 	err = -EINVAL;
 	if (IS_MNT_UNBINDABLE(old))
-		goto out2;
+		goto out_unlock;
 
+	/* If we're bind-mounting a file that's on a lower fs in a union then
+	 * we must first copy the file up as the copied mount stack attached to
+	 * the superblock is independent of any namespace and will fail the
+	 * check_mnt() test.  Directories are copied up during the pathwalk, so
+	 * we need not worry about those.
+	 */
+	err = union_copy_up(&old_path, &actual);
+	if (err < 0)
+		goto out_unlock;
+
+	parent = real_mount(path->mnt);
 	if (!check_mnt(parent) || !check_mnt(old))
-		goto out2;
+		goto out_unlock;
 
 	if (!recurse && has_locked_children(old, old_path.dentry))
-		goto out2;
+		goto out_unlock;
 
 	if (recurse)
 		mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE);
@@ -1827,7 +2042,7 @@
 
 	if (IS_ERR(mnt)) {
 		err = PTR_ERR(mnt);
-		goto out2;
+		goto out_unlock;
 	}
 
 	mnt->mnt.mnt_flags &= ~MNT_LOCKED;
@@ -1838,8 +2053,10 @@
 		umount_tree(mnt, 0);
 		unlock_mount_hash();
 	}
-out2:
+out_unlock:
 	unlock_mount(mp);
+out_lower:
+	path_put_maybe(&lower_cache);
 out:
 	path_put(&old_path);
 	return err;
@@ -1880,6 +2097,18 @@
 	if (!check_mnt(mnt))
 		return -EINVAL;
 
+	if ((path->mnt->mnt_flags & MNT_UNION) &&
+	    !(mnt_flags & MNT_UNION))
+		return -EINVAL;
+
+	if ((mnt_flags & MNT_UNION) &&
+	    !(path->mnt->mnt_flags & MNT_UNION))
+		return -EINVAL;
+
+	if ((path->mnt->mnt_flags & MNT_UNION) &&
+	    (mnt_flags & MNT_READONLY))
+		return -EINVAL;
+
 	if (path->dentry != path->mnt->mnt_root)
 		return -EINVAL;
 
@@ -2015,6 +2244,7 @@
 {
 	struct mountpoint *mp;
 	struct mount *parent;
+	bool unioned = false;
 	int err;
 
 	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED | MNT_SYNC_UMOUNT);
@@ -2045,7 +2275,17 @@
 		goto unlock;
 
 	newmnt->mnt.mnt_flags = mnt_flags;
+
+	if (IS_MNT_UNION(&newmnt->mnt)) {
+		err = prepare_mnt_union(newmnt, path);
+		if (err)
+			goto unlock;
+		unioned = true;
+	}
+
 	err = graft_tree(newmnt, parent, mp);
+	if (err < 0 && unioned)
+		cleanup_mnt_union(newmnt);
 
 unlock:
 	unlock_mount(mp);
diff --git a/fs/open.c b/fs/open.c
index 5c30ce3..1b48281 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -33,6 +33,7 @@
 #include <linux/compat.h>
 
 #include "internal.h"
+#include "union.h"
 
 int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 	struct file *filp)
@@ -65,29 +66,55 @@
 
 long vfs_truncate(struct path *path, loff_t length)
 {
+	struct path lower_cache, actual;
 	struct inode *inode;
 	long error;
 
-	inode = path->dentry->d_inode;
+	if (IS_PATH_UNIONED(path))
+		printk("UNION: truncate: path.mnt: at upper\n");
+
+	inode = union_get_inode(path, &lower_cache, &actual);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
 
 	/* For directories it's -EISDIR, for other non-regulars - -EINVAL */
+	error = -EISDIR;
 	if (S_ISDIR(inode->i_mode))
-		return -EISDIR;
+		goto out;
+	error = -EINVAL;
 	if (!S_ISREG(inode->i_mode))
-		return -EINVAL;
+		goto out;
 
 	error = mnt_want_write(path->mnt);
 	if (error)
 		goto out;
 
-	error = inode_permission(inode, MAY_WRITE);
-	if (error)
-		goto mnt_drop_write_and_out;
+	if (unlikely(d_is_unioned(path->dentry, &actual))) {
+		/* We have to be able to write to the upperfs. */
+		error = -EROFS;
+		if (path->dentry->d_sb->s_flags & MS_RDONLY)
+			goto mnt_drop_write_and_out;
+
+		/* But the lowerfs inode must offer write permission - if the
+		 * lowerfs was mounted writably. */
+		error = __inode_permission(inode, MAY_WRITE);
+		if (error)
+			goto mnt_drop_write_and_out;
+	} else {
+		error = inode_permission(inode, MAY_WRITE);
+		if (error)
+			goto mnt_drop_write_and_out;
+	}
 
 	error = -EPERM;
 	if (IS_APPEND(inode))
 		goto mnt_drop_write_and_out;
 
+	error = union_truncated_copy_up(path, &actual, &length);
+	if (error)
+		goto mnt_drop_write_and_out;
+
+	inode = path->dentry->d_inode;
 	error = get_write_access(inode);
 	if (error)
 		goto mnt_drop_write_and_out;
@@ -111,6 +138,7 @@
 mnt_drop_write_and_out:
 	mnt_drop_write(path->mnt);
 out:
+	path_put_maybe(&lower_cache);
 	return error;
 }
 EXPORT_SYMBOL_GPL(vfs_truncate);
@@ -332,7 +360,7 @@
 	if (res)
 		goto out;
 
-	inode = path.dentry->d_inode;
+	inode = d_inode_or_lower(path.dentry);
 
 	if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) {
 		/*
@@ -344,7 +372,15 @@
 			goto out_path_release;
 	}
 
-	res = inode_permission(inode, mode | MAY_ACCESS);
+	/* For unionmount files, we need to check the permissions on the upper
+	 * superblock and the lower inode.
+	 */
+	res = sb_permission(path.dentry->d_sb, inode, mode);
+	if (res != 0)
+		goto out_path_release;
+
+	res = __inode_permission(inode, mode | MAY_ACCESS);
+
 	/* SuS v2 requires we report a read only fs too */
 	if (res || !(mode & S_IWOTH) || special_file(inode->i_mode))
 		goto out_path_release;
@@ -464,19 +500,32 @@
 
 static int chmod_common(struct path *path, umode_t mode)
 {
-	struct inode *inode = path->dentry->d_inode;
-	struct inode *delegated_inode = NULL;
+	struct inode *inode, *delegated_inode = NULL;
+	struct path lower_cache, actual;
 	struct iattr newattrs;
 	int error;
 
+	inode = union_get_inode(path, &lower_cache, &actual);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
 	error = mnt_want_write(path->mnt);
 	if (error)
-		return error;
+		goto out_lower;
 retry_deleg:
+again:
 	mutex_lock(&inode->i_mutex);
-	error = security_path_chmod(path, mode);
+	error = security_path_chmod(&actual, mode);
 	if (error)
 		goto out_unlock;
+	if (d_is_unioned(path->dentry, &actual)) {
+		mutex_unlock(&inode->i_mutex);
+		error = union_copy_up(path, &actual);
+		if (error < 0)
+			goto out_drop_write;
+		inode = actual.dentry->d_inode;
+		goto again;
+	}
 	newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
 	newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
 	error = notify_change(path->dentry, &newattrs, &delegated_inode);
@@ -487,7 +536,10 @@
 		if (!error)
 			goto retry_deleg;
 	}
+out_drop_write:
 	mnt_drop_write(path->mnt);
+out_lower:
+	path_put_maybe(&lower_cache);
 	return error;
 }
 
@@ -498,7 +550,10 @@
 
 	if (f.file) {
 		audit_inode(NULL, f.file->f_path.dentry, 0);
-		err = chmod_common(&f.file->f_path, mode);
+		if (f.file->f_inode != f.file->f_path.dentry->d_inode)
+			err = -EACCES; /* Unioned, but can't copy up. */
+		else
+			err = chmod_common(&f.file->f_path, mode);
 		fdput(f);
 	}
 	return err;
@@ -509,6 +564,7 @@
 	struct path path;
 	int error;
 	unsigned int lookup_flags = LOOKUP_FOLLOW;
+
 retry:
 	error = user_path_at(dfd, filename, lookup_flags, &path);
 	if (!error) {
@@ -529,8 +585,8 @@
 
 static int chown_common(struct path *path, uid_t user, gid_t group)
 {
-	struct inode *inode = path->dentry->d_inode;
-	struct inode *delegated_inode = NULL;
+	struct path lower_cache, actual;
+	struct inode *inode, *delegated_inode = NULL;
 	int error;
 	struct iattr newattrs;
 	kuid_t uid;
@@ -542,31 +598,54 @@
 	newattrs.ia_valid =  ATTR_CTIME;
 	if (user != (uid_t) -1) {
 		if (!uid_valid(uid))
-			return -EINVAL;
+			goto einval;
 		newattrs.ia_valid |= ATTR_UID;
 		newattrs.ia_uid = uid;
 	}
 	if (group != (gid_t) -1) {
 		if (!gid_valid(gid))
-			return -EINVAL;
+			goto einval;
 		newattrs.ia_valid |= ATTR_GID;
 		newattrs.ia_gid = gid;
 	}
+
+	inode = union_get_inode(path, &lower_cache, &actual);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
 	if (!S_ISDIR(inode->i_mode))
 		newattrs.ia_valid |=
 			ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
 retry_deleg:
+again:
 	mutex_lock(&inode->i_mutex);
 	error = security_path_chown(path, uid, gid);
-	if (!error)
-		error = notify_change(path->dentry, &newattrs, &delegated_inode);
+	if (error < 0)
+		goto out_unlock;	/* i_mutex is still held */
+
+	if (d_is_unioned(path->dentry, &actual)) {
+		mutex_unlock(&inode->i_mutex);
+		error = union_copy_up(path, &actual);
+		if (error < 0)
+			goto out_put;	/* i_mutex already dropped above */
+		inode = actual.dentry->d_inode;
+		goto again;
+	}
+
+	error = notify_change(path->dentry, &newattrs, &delegated_inode);
 	mutex_unlock(&inode->i_mutex);
 	if (delegated_inode) {
 		error = break_deleg_wait(&delegated_inode);
 		if (!error)
 			goto retry_deleg;
 	}
+out_put:
+	path_put_maybe(&lower_cache);
 	return error;
+out_unlock:
+	mutex_unlock(&inode->i_mutex);
+	goto out_put;
+einval:	/* union_get_inode() has not run yet, so lower_cache must not be put */
+	return -EINVAL;
 }
 
 SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
@@ -620,6 +699,11 @@
 	if (!f.file)
 		goto out;
 
+	if (f.file->f_inode != f.file->f_path.dentry->d_inode) {
+		error = -EACCES; /* Unioned, but can't copy up. */
+		goto out_fput;
+	}
+
 	error = mnt_want_write_file(f.file);
 	if (error)
 		goto out_fput;
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index bafb5a3..5ad8ad0 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -65,6 +65,8 @@
 		{ MNT_NOATIME, ",noatime" },
 		{ MNT_NODIRATIME, ",nodiratime" },
 		{ MNT_RELATIME, ",relatime" },
+		{ MNT_SHARED, ",shared" },
+		{ MNT_UNBINDABLE, ",nobind" },
 		{ MNT_UNION, ",union" },
 		{ 0, NULL }
 	};
diff --git a/fs/readdir.c b/fs/readdir.c
index 483bd75..ebe5665 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -20,6 +20,8 @@
 
 #include <asm/uaccess.h>
 
+#include "union.h"
+
 int iterate_dir(struct file *file, struct dir_context *ctx)
 {
 	struct inode *inode = file_inode(file);
@@ -31,9 +33,27 @@
 	if (res)
 		goto out;
 
-	res = mutex_lock_killable(&inode->i_mutex);
-	if (res)
-		goto out;
+	if (unlikely(IS_DIR_UNIONED(file->f_path.dentry)) &&
+	    !IS_OPAQUE(file->f_path.dentry->d_inode)) {
+		res = mnt_want_write(file->f_path.mnt);
+		if (res < 0)
+			goto out;
+
+		res = mutex_lock_killable(&inode->i_mutex);
+		if (res < 0) {
+			mnt_drop_write(file->f_path.mnt);
+			goto out;
+		}
+
+		res = union_copy_up_dir(&file->f_path);
+		mnt_drop_write(file->f_path.mnt);
+		if (res < 0)
+			goto out_unlock;
+	} else {
+		res = mutex_lock_killable(&inode->i_mutex);
+		if (res)
+			goto out;
+	}
 
 	res = -ENOENT;
 	if (!IS_DEADDIR(inode)) {
@@ -42,6 +62,7 @@
 		file->f_pos = ctx->pos;
 		file_accessed(file);
 	}
+out_unlock:
 	mutex_unlock(&inode->i_mutex);
 out:
 	return res;
diff --git a/fs/stat.c b/fs/stat.c
index ae0c3ce..909d3cb 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -17,6 +17,7 @@
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
+#include "union.h"
 
 void generic_fillattr(struct inode *inode, struct kstat *stat)
 {
@@ -51,25 +52,50 @@
  */
 int vfs_getattr_nosec(struct path *path, struct kstat *stat)
 {
-	struct inode *inode = path->dentry->d_inode;
+	struct path lower_cache, actual;
+	struct inode *inode;
+	int retval = 0;	/* result when only generic_fillattr() is used */
 
-	if (inode->i_op->getattr)
-		return inode->i_op->getattr(path->mnt, path->dentry, stat);
+	inode = union_get_inode(path, &lower_cache, &actual);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	if (inode->i_op->getattr) {
+		retval = inode->i_op->getattr(actual.mnt, actual.dentry, stat);
+		goto out;
+	}
 
 	generic_fillattr(inode, stat);
-	return 0;
+out:
+	path_put_maybe(&lower_cache);
+	return retval;
 }
 
 EXPORT_SYMBOL(vfs_getattr_nosec);
 
 int vfs_getattr(struct path *path, struct kstat *stat)
 {
+	struct path lower_cache, actual;
+	struct inode *inode;
 	int retval;
 
-	retval = security_inode_getattr(path->mnt, path->dentry);
+	inode = union_get_inode(path, &lower_cache, &actual);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+	/* The LSM check and ->getattr() both use the path returned in actual. */
+	retval = security_inode_getattr(actual.mnt, actual.dentry);
 	if (retval)
-		return retval;
-	return vfs_getattr_nosec(path, stat);
+		goto out;
+	/* The security check passed, so retval is 0 from here on. */
+	if (inode->i_op->getattr) {
+		retval = inode->i_op->getattr(actual.mnt, actual.dentry, stat);
+		goto out;
+	}
+
+	generic_fillattr(inode, stat);
+out:
+	path_put_maybe(&lower_cache);
+	return retval;
 }
 
 EXPORT_SYMBOL(vfs_getattr);
@@ -326,7 +352,12 @@
 retry:
 	error = user_path_at_empty(dfd, pathname, lookup_flags, &path, &empty);
 	if (!error) {
-		struct inode *inode = path.dentry->d_inode;
+		struct inode *inode = d_inode_or_lower(path.dentry);
+
+		if (IS_MNT_UNION(path.mnt)) {
+			printk("readlink inode: %p -> %p [%x %d]\n",
+			       path.dentry->d_inode, inode, path.dentry->d_flags, empty);
+		}
 
 		error = empty ? -ENOENT : -EINVAL;
 		if (inode->i_op->readlink) {
diff --git a/fs/super.c b/fs/super.c
index 4c9a2fe..8a67018 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -283,6 +283,7 @@
 		unregister_shrinker(&s->s_shrink);
 
 		put_filesystem(fs);
+		put_union_sb(s);
 		put_super(s);
 	} else {
 		up_write(&s->s_umount);
diff --git a/fs/union.c b/fs/union.c
new file mode 100644
index 0000000..0188bf1
--- /dev/null
+++ b/fs/union.c
@@ -0,0 +1,931 @@
+/* VFS-based union mounts for Linux
+ *
+ * Copyright (C) 2004-2007 IBM Corporation, IBM Deutschland Entwicklung GmbH.
+ * Copyright (C) 2007-2009 Novell Inc.
+ * Copyright (C) 2009-2012 Red Hat, Inc.
+ *
+ *   Author(s): Jan Blunck (j.blunck@tu-harburg.de)
+ *              Valerie Aurora <vaurora@redhat.com>
+ *              David Howells <dhowells@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#define DEBUG
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/fs_struct.h>
+#include <linux/slab.h>
+#include <linux/fsnotify.h>
+#include <linux/xattr.h>
+#include <linux/file.h>
+#include <linux/security.h>
+#include <linux/splice.h>
+#include <linux/ratelimit.h>
+
+#include "internal.h"
+#include "union.h"
+
+/**
+ * d_free_unions - free all unions for this dentry
+ * @topmost: topmost dentry in the union stack to remove
+ *
+ * This must be called when freeing a dentry.  d_inode may point to a defunct
+ * inode or may have been cleared by the time we get here.
+ */
+void d_free_unions(struct dentry *topmost)
+{
+	struct path *path;
+	unsigned int i, layers = topmost->d_sb->s_union_count;
+
+	if (topmost->d_union_stack) {
+		if (topmost->d_flags & DCACHE_UNION_PINNING_LOWER) {
+			/* A negative non-dir upper dentry is pinning
+			 * a single lower dentry so that f_inode
+			 * doesn't have to.
+			 */
+			printk("free pin: %pd\n", topmost);
+			dput(topmost->d_fallthru);
+		} else {
+			/* A positive directory dentry is pinning a
+			 * stack of lower dirs.
+			 */
+			printk("free dirstack: %pd\n", topmost);
+			/* Drop the refs held by each slot that has a mount set. */
+			for (i = 0; i < layers; i++) {
+				path = union_find_dir(topmost, i);
+				if (path->mnt)
+					path_put(path);
+			}
+			kfree(topmost->d_union_stack);
+		}
+		topmost->d_union_stack = NULL;
+	}
+}
+
+/**
+ * union_add_dir - Add another layer to a unioned directory
+ * @topmost: topmost directory
+ * @lower: directory in the current layer
+ * @layer: index of layer to add this at
+ *
+ * @layer counts starting at 0 for the dir below the topmost dir.
+ *
+ * This transfers the caller's references to the constituents of *lower to the
+ * union stack.  Returns 0, or -ENOMEM if no stack could be allocated.
+ */
+int union_add_dir(struct path *topmost, struct path *lower, unsigned layer)
+{
+	struct dentry *dentry = topmost->dentry;
+	struct path *path;
+
+	BUG_ON(layer >= dentry->d_sb->s_union_count);
+	BUG_ON(d_is_fallthru(dentry));
+	/* The stack is allocated on first use. */
+	if (!dentry->d_union_stack)
+		dentry->d_union_stack = union_alloc_stack(topmost);
+	if (!dentry->d_union_stack)
+		return -ENOMEM;
+
+	path = union_find_dir(dentry, layer);
+	*path = *lower;
+	return 0;
+}
+
+/**
+ * union_copy_up_xattr - Copy every xattr of @old onto @new
+ * @new: path of new copy (its dentry must be positive)
+ * @old: dentry of original file
+ *
+ * Copy up extended attributes from the original file to the new one.
+ *
+ * XXX - Permissions?  For now, copying up every xattr.
+ */
+static int union_copy_up_xattr(struct path *new, struct dentry *old)
+{
+	ssize_t list_size, size;
+	char *buf, *name, *value;
+	int error;
+
+	/* Check for xattr support */
+	if (!old->d_inode->i_op->getxattr ||
+	    !new->dentry->d_inode->i_op->getxattr)
+		return 0;
+
+	/* Find out how big the list of xattrs is */
+	list_size = vfs_listxattr(old, NULL, 0);
+	if (list_size <= 0)
+		return list_size;
+
+	/* Allocate memory for the list */
+	buf = kzalloc(list_size, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	/* Allocate memory for the xattr's value */
+	error = -ENOMEM;
+	value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL);
+	if (!value)
+		goto out;
+
+	/* Actually get the list of xattrs */
+	list_size = vfs_listxattr(old, buf, list_size);
+	if (list_size <= 0) {
+		error = list_size;
+		goto out_free_value;
+	}
+
+	for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
+		/* XXX Locking? old is on read-only fs */
+		size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX);
+		if (size < 0) {	/* 0 is a valid (empty) value: keep going */
+			error = size;
+			goto out_free_value;
+		}
+		/* XXX do we really need to check for size overflow? */
+		/* XXX locks new dentry, lock ordering problems? */
+		error = vfs_setxattr(new, name, value, size, 0);
+		if (error)
+			goto out_free_value;
+	}
+
+out_free_value:
+	kfree(value);
+out:
+	kfree(buf);
+	return error;
+}
+
+/**
+ * union_create_topmost_dir - Create a matching dir in the topmost file system
+ * @parent: parent of target on topmost layer
+ * @topmost: path of target on topmost layer
+ * @d: stack of source directories in lower layers
+ *
+ * As we lookup each directory on the lower layer of a union, we create a
+ * matching directory on the topmost layer if it does not already exist.
+ *
+ * We don't use vfs_mkdir() for a few reasons: don't want to do the security
+ * check, don't want to make the dir opaque, don't need to sanitize the mode.
+ *
+ * The caller must hold the parent i_mutex lock and the mnt_want_write lock.
+ *
+ * XXX - owner is wrong, set credentials properly
+ * XXX - rmdir() directory on failure of xattr copyup
+ * XXX - not atomic w/ respect to crash
+ */
+int union_create_topmost_dir(struct path *parent,
+			     struct path *topmost,
+			     struct union_stack *d)
+{
+	struct dentry *lower;
+	struct inode *dir = parent->dentry->d_inode;
+	unsigned i, layers = parent->dentry->d_sb->s_union_count;
+	int error;
+
+	BUG_ON(topmost->dentry->d_inode);
+	/* The first layer with a dentry supplies the mode and the xattrs. */
+	for (i = 0; i < layers; i++)
+		if ((lower = d->u_dirs[i].dentry))
+			break;
+	/* NOTE(review): lower is NULL (or unset) if no layer has one - check. */
+	/* XXX - Do we even need to check this? */
+	if (!dir->i_op->mkdir)
+		return -EPERM;
+
+	error = dir->i_op->mkdir(dir, topmost->dentry, lower->d_inode->i_mode);
+	if (error)
+		return error;
+
+	error = union_copy_up_xattr(topmost, lower);
+	if (error)
+		goto out_rmdir;
+
+	fsnotify_mkdir(dir, topmost->dentry);
+	return 0;
+
+out_rmdir:
+	/* XXX rm created dir */
+	dput(topmost->dentry);
+	return error;
+}
+
+struct union_iterate_context {
+	struct dir_context ctx;		/* passed to ->iterate(); assumed first */
+	struct dentry *topmost_dentry;	/* upper dir that receives fallthrus */
+	int error;			/* last error set by the dirent actor */
+};
+
+/**
+ * union_copy_up_one_dirent - copy up a single directory entry
+ *
+ * Individual directory entry copyup function for __union_copy_up_dir().
+ * We get the entries from higher level layers first.
+ */
+static int union_copy_up_one_dirent(void *buf, const char *name, int namelen,
+				    loff_t offset, u64 ino, unsigned int d_type)
+{
+	struct union_iterate_context *uic = (struct union_iterate_context *)buf;
+	struct dentry *topmost_dentry = uic->topmost_dentry;
+	struct dentry *dentry;
+	int err = 0;
+	/* Skip "." and ".."; case 2 deliberately falls through to case 1. */
+	switch (namelen) {
+	case 2:
+		if (name[1] != '.')
+			break;
+	case 1:
+		if (name[0] != '.')
+			break;
+		return 0;
+	}
+
+	/* Lookup this entry in the topmost directory */
+	dentry = lookup_one_len(name, topmost_dentry, namelen);
+
+	if (IS_ERR(dentry)) {
+		printk(KERN_WARNING "%s: error looking up %*.*s\n",
+		       __func__, namelen, namelen, name);
+		err = PTR_ERR(dentry);
+		goto out;
+	}
+
+	/* XXX do we need to revalidate on readdir anyway? think NFS */
+	if (dentry->d_op && dentry->d_op->d_revalidate)
+		goto fallthru;
+
+	/* If the entry already exists, one of the following is true: it was
+	 * already copied up (due to an earlier lookup), an entry with the same
+	 * name already exists on the topmost file system, it is a whiteout, or
+	 * it is a fallthru.  In each case, the top level entry masks any
+	 * entries from lower file systems, so don't copy up this entry.
+	 */
+	if (dentry->d_inode || d_is_whiteout(dentry) || d_is_fallthru(dentry))
+		goto out_dput;
+
+	/* If the entry doesn't exist, create a fallthru entry in the topmost
+	 * file system.  All possible directory types are used, so each file
+	 * system must implement its own way of storing a fallthru entry.
+	 */
+fallthru:
+	err = topmost_dentry->d_inode->i_op->fallthru(topmost_dentry->d_inode,
+						      dentry);
+
+	/* It's okay if it exists, ultimate responsibility rests with
+	 * ->fallthru() */
+	if (err == -EEXIST)
+		err = 0;
+out_dput:
+	dput(dentry);
+out:
+	if (err)
+		uic->error = err;
+	return err;
+}
+
+/**
+ * __union_copy_up_dir - Non-recursive directory copy up
+ *
+ * Copy up the specified directory only, without recursing into the subtree
+ * rooted at this point.
+ *
+ * During the operation, where a directory entry exists in one of the lower
+ * directories, a fallthrough dentry will be created in the upper directory if
+ * the upper directory doesn't already have an entry that obscures it.  At the
+ * end of the operation, the upper directory will be marked opaque on the
+ * medium - thus preventing further copy up attempts on this directory.
+ *
+ * TODO: At some point in the future, on-medium whiteouts should be culled from
+ * a directory that is marked opaque as they then serve no purpose.
+ *
+ * The primary reason for this function is that readdir() is difficult to
+ * support on union file systems for two reasons: We must eliminate duplicates
+ * and apply whiteouts, and we must return something in f_pos that lets us
+ * restart in the same place when we return.  Our solution is to, on first
+ * readdir() of the directory, copy up all visible entries from the low-level
+ * file systems and mark the entries that refer to low-level file system
+ * objects as "fallthrough" entries.
+ *
+ * Sadly, this function is also necessary for rmdir().  To work out whether a
+ * directory is empty, we have to work out if there are entries in lower
+ * directories that are not obscured by whiteouts in the upper.  This is not a
+ * trivial operation.  The simplest way is, therefore, to copy up and then
+ * check the combined opaque directory.
+ *
+ *
+ * Locking strategy: We hold the topmost dir's i_mutex on entry.  We grab the
+ * i_mutex on lower directories one by one.  So the locking order is:
+ *
+ *	Writable/topmost layers > Read-only/lower layers
+ *
+ * So there is no problem with lock ordering for union stacks with
+ * multiple lower layers.  E.g.:
+ *
+ *	(topmost) A->B->C (bottom)
+ *	(topmost) D->C->B (bottom)
+ *
+ */
+int __union_copy_up_dir(struct path *topmost_path)
+{
+	struct dentry *topmost_dentry = topmost_path->dentry;
+	unsigned int i, layers = topmost_dentry->d_sb->s_union_count;
+	int error = 0;
+
+	struct union_iterate_context uic = {
+		.ctx.actor = union_copy_up_one_dirent,
+		.topmost_dentry = topmost_dentry,
+	};
+
+	/* An opaque dir has already been copied up: nothing to do. */
+	if (IS_OPAQUE(topmost_dentry->d_inode))
+		return 0;
+
+	if (!topmost_dentry->d_inode->i_op ||
+	    !topmost_dentry->d_inode->i_op->fallthru)
+		return -EOPNOTSUPP;
+
+	for (i = 0; i < layers; i++) {
+		struct inode *inode;
+		struct file *ftmp;
+		struct path *path;
+
+		path = union_find_dir(topmost_dentry, i);
+		if (!path->mnt)
+			continue;	/* slot has no mount set */
+
+		ftmp = dentry_open(path, O_RDONLY | O_DIRECTORY | O_NOATIME,
+				   current_cred());
+		if (IS_ERR(ftmp)) {
+			printk(KERN_ERR "unable to open dir %pd for "
+			       "directory copyup: %ld\n",
+			       path->dentry, PTR_ERR(ftmp));
+			error = PTR_ERR(ftmp);
+			break;
+		}
+
+		inode = file_inode(ftmp);
+		mutex_lock(&inode->i_mutex);
+
+		error = -ENOENT;
+		if (IS_DEADDIR(inode))
+			goto out_fput;
+
+		/* Read the whole directory, calling our directory entry copyup
+		 * function on each entry.
+		 */
+		uic.ctx.pos = 0;
+		uic.error = 0;
+		error = ftmp->f_op->iterate(ftmp, &uic.ctx);
+out_fput:
+		mutex_unlock(&inode->i_mutex);
+		fput(ftmp);
+		/* An error recorded by the actor overrides ->iterate()'s result. */
+		if (uic.error)
+			error = uic.error;
+		if (error)
+			break;
+
+		/* XXX Should process directories below an opaque directory in
+		 * case there are fallthrus in it
+		 */
+		if (IS_OPAQUE(path->dentry->d_inode))
+			break;
+	}
+
+	/* Mark this dir opaque to show that we have already copied up the
+	 * lower entries.  Be sure to do this AFTER the directory entries have
+	 * been copied up so that if we crash in the middle of copyup, we will
+	 * try to copyup the dir next time we read it.
+	 *
+	 * XXX - Could leave directory non-opaque, and force reread/copyup of
+	 * directory each time it is read in from disk.  That would make it
+	 * easy to update lower file systems (when not union mounted) and have
+	 * the changes show up when union mounted again.
+	 */
+	if (!error) {
+		topmost_dentry->d_inode->i_flags |= S_OPAQUE;
+		mark_inode_dirty(topmost_dentry->d_inode);
+	}
+
+	return error;
+}
+
+/* Map the file-type bits of i_mode onto the corresponding DT_xxx value */
+static inline unsigned char dt_type(struct inode *inode)
+{
+	return (inode->i_mode & S_IFMT) >> 12;
+}
+
+/**
+ * generic_readdir_fallthru - Helper to lookup target of a fallthru
+ * @topmost_dentry: dentry for the topmost dentry of the dir being read
+ * @name: name of fallthru dirent
+ * @namlen: length of @name
+ * @ino: return inode number of target, if found
+ * @d_type: return directory type of target, if found
+ *
+ * In readdir(), client file systems need to lookup the target of a
+ * fallthru in a lower layer for three reasons: (1) fill in d_ino, (2)
+ * fill in d_type, (3) make sure there is something to fall through to
+ * (and if not, don't return this dentry).  Upon detecting a fallthru
+ * dentry in readdir(), the client file system should call this function.
+ *
+ * Returns 0 on success, -ENOENT if no matching entry was found (possible when
+ * the topmost fs is remounted over another fs); other errors are unexpected.
+ */
+int generic_readdir_fallthru(struct dentry *topmost_dentry, const char *name,
+			     int namlen, ino_t *ino, unsigned char *d_type)
+{
+	struct path *parent;
+	struct dentry *dentry;
+	unsigned int i, layers = topmost_dentry->d_sb->s_union_count;
+
+	BUG_ON(!mutex_is_locked(&topmost_dentry->d_inode->i_mutex));
+
+	for (i = 0; i < layers; i++) {
+		parent = union_find_dir(topmost_dentry, i);
+		if (!parent->mnt)
+			continue;	/* empty slot, as in __union_copy_up_dir() */
+		mutex_lock(&parent->dentry->d_inode->i_mutex);
+		dentry = lookup_one_len(name, parent->dentry, namlen);
+		mutex_unlock(&parent->dentry->d_inode->i_mutex);
+		if (IS_ERR(dentry))
+			return PTR_ERR(dentry);
+		if (dentry->d_inode) {
+			*ino = dentry->d_inode->i_ino;
+			*d_type = dt_type(dentry->d_inode);
+			dput(dentry);
+			return 0;
+		}
+		dput(dentry);
+	}
+	return -ENOENT;
+}
+EXPORT_SYMBOL(generic_readdir_fallthru);
+
+/*
+ * Get the inode and path for a dentry where that inode may exist on a lower
+ * layer in a union.
+ *
+ * The caller must preclear the elements of *_lower_cache and prime *_actual
+ * with the contents of *upper (as is done by wrappers in union.h) and must
+ * also hold parent->i_mutex.
+ *
+ * Note that we don't get a ref on the inode or the lower vfsmount (if
+ * returned).  We leave it to the caller to iget/mntget them if appropriate.
+ * This should be safe as the caller holds parent->i_mutex.  The lower dentry
+ * (if returned) is dget'd, however.
+ *
+ * The pointers returned in *_actual are not dget'd/mntget'd as it is assumed
+ * they're pinned by the caller's ref on upper->mnt (if set), upper->dentry; or
+ * by the fact that parent->i_mutex is locked and _lower_cache->dentry is
+ * dget'd.
+ */
+struct inode *__union_get_inode_locked(struct dentry *parent,
+				       struct path *upper,
+				       struct path *_lower_cache,
+				       struct path *_actual)
+{
+	const struct union_stack *d;
+	struct dentry *dentry = upper->dentry;
+	unsigned i, layers = parent->d_sb->s_union_count;
+
+	pr_devel("-->%s(%pd,)\n", __func__, dentry);
+
+	BUG_ON(d_is_whiteout(dentry));
+
+	/* Check for a race with copy up. */
+	if (likely(dentry->d_inode)) {
+		pr_devel("<--%s() = upper\n", __func__);
+		*_actual = *upper;
+		return dentry->d_inode;
+	}
+
+	BUG_ON(!(dentry->d_flags & DCACHE_UNION_PINNING_LOWER));
+	/* NOTE(review): mntget() below seems at odds with the header comment. */
+	pr_devel("<--%s() = fallthru\n", __func__);
+	smp_rmb();
+	_actual->dentry = dentry->d_fallthru;
+	d = parent->d_union_stack;
+	for (i = 0; i < layers; i++) {
+		if (d->u_dirs[i].dentry == dentry->d_fallthru->d_parent) {
+			_lower_cache->mnt = d->u_dirs[i].mnt;
+			break;
+		}
+	}
+	if (unlikely(!_lower_cache->mnt))
+		goto out_badcache;
+	_actual->mnt = mntget(_lower_cache->mnt);
+	return dentry->d_fallthru->d_inode;
+
+out_badcache:
+	printk_ratelimited(KERN_WARNING "UNION: Bad cached fallthru (%pd/%pd)\n",
+			   parent, upper->dentry);
+	return ERR_PTR(-EIO);
+}
+
+/*
+ * Get the inode for a dentry where that inode may exist on a lower layer in a
+ * union.
+ *
+ * Note that we don't get a ref on the inode, so we may need to pin it by
+ * getting a ref on a dentry pointing to it - in which case, a pointer to that
+ * dentry will be returned in *_lower_cache and the caller is expected to
+ * dput() the ref on it.
+ */
+struct inode *__union_get_inode(struct path *upper, struct path *_lower_cache,
+				struct path *_actual)
+{
+	struct dentry *parent, *dentry = upper->dentry;
+	struct inode *inode;
+	int ret;
+
+	pr_devel("-->%s(%pd,)\n", __func__, dentry);
+
+	/* We need the parent directory so that we can find the stack of lower
+	 * directories in which to do lookups.  Use the rename mutex to prevent
+	 * rename from getting underfoot whilst we get the parent.
+	 */
+	if (mutex_lock_interruptible(&dentry->d_sb->s_vfs_rename_mutex) < 0)
+		return ERR_PTR(-EINTR);
+
+	parent = dget_parent(dentry);
+	if (IS_OPAQUE(parent->d_inode) && !d_is_fallthru(dentry)) {
+		mutex_unlock(&dentry->d_sb->s_vfs_rename_mutex);
+		inode = NULL;	/* NULL, not an ERR_PTR, in this case */
+	} else {
+		ret = mutex_lock_interruptible(&parent->d_inode->i_mutex);
+		mutex_unlock(&dentry->d_sb->s_vfs_rename_mutex);
+		if (ret < 0) {
+			inode = ERR_PTR(ret);
+		} else {
+			inode = __union_get_inode_locked(parent, upper,
+							 _lower_cache, _actual);
+			mutex_unlock(&parent->d_inode->i_mutex);
+		}
+	}
+	dput(parent);
+	return inode;
+}
+
+/**
+ * union_create_file - Create the upper file via the parent's ->tmpfile() op
+ * @parent: path of the upper parent directory
+ * @upper: path of the negative dentry to become new file
+ * @lower: path of the source file
+ *
+ * Must already have mnt_want_write() on the mnt and the parent's i_mutex.
+ */
+static int union_create_file(struct path *parent, struct path *upper,
+			     struct path *lower)
+{
+	struct inode *dir = parent->dentry->d_inode;
+	int ret;
+
+	if (!dir->i_op->tmpfile)
+		return -EPERM;
+	/* The new inode takes its mode from the lower file. */
+	ret = dir->i_op->tmpfile(dir, upper->dentry,
+				 lower->dentry->d_inode->i_mode);
+	if (ret == 0) {
+		spin_lock(&upper->dentry->d_inode->i_lock);
+		upper->dentry->d_inode->i_state |= I_LINKABLE;
+		spin_unlock(&upper->dentry->d_inode->i_lock);
+	}
+	return ret;
+}
+
+/**
+ * union_create_symlink - Recreate the lower symlink's target at @upper
+ * @parent: Upper parent of the symlink
+ * @upper: Path of the negative dentry to become new symlink.
+ * @lower: Path of the source symlink
+ *
+ * Must already have mnt_want_write() on the mnt and the parent's i_mutex.
+ */
+static int union_create_symlink(struct path *parent, struct path *upper,
+				struct path *lower)
+{
+	struct inode *inode = lower->dentry->d_inode;
+	char *content;
+	int error;
+
+	content = kmalloc(PATH_MAX + 2, GFP_KERNEL);
+	if (!content)
+		return -ENOMEM;
+	/* NOTE(review): kernel buffer passed to ->readlink() - confirm OK. */
+	error = inode->i_op->readlink(lower->dentry, content, PATH_MAX + 1);
+	if (error < 0)
+		goto error;
+	content[error] = 0;
+
+	error = vfs_symlink(parent->dentry->d_inode, upper->dentry, content);
+error:
+	kfree(content);
+	return error;
+}
+
+/**
+ * union_copy_up_data - Copy the lower file's data into the new upper file
+ * @path: path of target file
+ * @actual: path of source file in lower layer
+ * @truncate_to: upper bound on bytes to copy (or NULL for the whole file)
+ */
+static int union_copy_up_data(struct path *path, struct path *actual,
+			      const loff_t *truncate_to)
+{
+	const struct cred *cred = current_cred();
+	struct file *lower_file;
+	struct file *new_file;
+	loff_t filesize, loffset = 0, noffset = 0;
+	size_t len;
+	long bytes;
+	int error = 0;
+
+	filesize = i_size_read(actual->dentry->d_inode);
+	if (truncate_to && *truncate_to < filesize)
+		filesize = *truncate_to;
+
+	/* Reject sizes that do not survive the conversion to size_t */
+	len = filesize;
+	if (len != filesize)
+		return -EFBIG;
+
+	if (len == 0)
+		return 0;
+
+	lower_file = dentry_open(actual, O_RDONLY, cred);
+	if (IS_ERR(lower_file))
+		return PTR_ERR(lower_file);
+
+	new_file = dentry_open(path, O_WRONLY, cred);
+	if (IS_ERR(new_file)) {
+		error = PTR_ERR(new_file);
+		goto out_fput;
+	}
+	/* NOTE(review): a short splice (0 <= bytes < len) is not detected. */
+	bytes = do_splice_direct(lower_file, &loffset,
+				 new_file, &noffset, len,
+				 SPLICE_F_MOVE);
+	if (bytes < 0)
+		error = bytes;
+
+	fput(new_file);
+out_fput:
+	fput(lower_file);
+	return error;
+}
+
+/*
+ * Create a temporary file.  We don't want to inline this as it uses quite a
+ * lot of stack space.
+ *
+ * The caller should make sure _tmpfile->mnt is set to the upper vfsmount and
+ * that ->dentry is NULL.
+ *
+ * Note: we don't return with a ref on _tmpfile->mnt as path is holding a ref.
+ * Further, we may return with a dentry in _tmpfile->dentry that needs
+ * dput'ing, even if an error occurred.  Returns 0 or a negative error code.
+ */
+static int union_create_tmpfile(struct path *parent, struct path *path,
+				struct path *actual, struct path *_tmpfile)
+{
+	static const struct qstr nameless = { .name = "", .len = 0, .hash = 0 };
+	struct dentry *dentry;
+	int ret;
+
+	pr_devel("-->%s(%pd)\n", __func__, path->dentry);
+
+	/* Create a nameless file not directly attached to the parent
+	 * directory, but still associated with it for layout optimisation
+	 * reasons.  The upperfs should check for the file being of zero
+	 * length.
+	 *
+	 * We will then hard link the file into place when we're done copying
+	 * up - and mount/fsck will clean it up in the event of a crash and
+	 * dput() will clean it up in the event of an error.
+	 */
+	dentry = d_alloc(parent->dentry, &nameless);
+	if (dentry) {
+		_tmpfile->dentry = dentry;
+		if (S_ISREG(actual->dentry->d_inode->i_mode))
+			ret = union_create_file(parent, _tmpfile, actual);
+		else if (S_ISLNK(actual->dentry->d_inode->i_mode))
+			ret = union_create_symlink(parent, _tmpfile, actual);
+		else
+			BUG();
+	} else {
+		ret = -ENOMEM;	/* d_alloc() fails with NULL, not an ERR_PTR */
+	}
+
+	pr_devel("<--%s() = %d\n", __func__, ret);
+	return ret;
+}
+
+/*
+ * Copy up a file or symlink to a nameless temporary file created under
+ * @parent; the new dentry is handed back in _tmpfile->dentry.
+ */
+static int union_copy_up_to_tmpfile(struct path *parent, struct path *path,
+				    struct path *actual, struct path *_tmpfile,
+				    const loff_t *truncate_to)
+{
+	struct dentry *dentry = actual->dentry;
+	int ret;
+
+	ret = union_create_tmpfile(parent, path, actual, _tmpfile);
+	/* Data is only copied for regular files; xattrs are copied for both. */
+	if (ret == 0 && S_ISREG(dentry->d_inode->i_mode))
+		ret = union_copy_up_data(_tmpfile, actual, truncate_to);
+	if (ret == 0)
+		ret = union_copy_up_xattr(_tmpfile, actual->dentry);
+	return ret;
+}
+
+/*
+ * Hard link the temporary file to path->dentry in @parent; returns vfs_link().
+ */
+static int union_hard_link_to_tmpfile(struct path *parent, struct path *path,
+				      struct path *tmpfile)
+{
+	int ret;
+
+	pr_devel("-->%s(%pd,%pd,%pd)\n",
+		 __func__, parent->dentry, path->dentry, tmpfile->dentry);
+
+	ret = vfs_link(tmpfile->dentry, parent->dentry->d_inode, path->dentry,
+		       NULL);
+	return ret;
+}
+
+/**
+ * union_copy_up_via_tmpfile - Copy up lower file via temporary file
+ *
+ * Copy up a file or symlink to a nameless temporary file, then hard link it
+ * to its final name and drop the temporary dentry.
+ */
+static int union_copy_up_via_tmpfile(struct path *parent, struct path *path,
+				     struct path *actual,
+				     const loff_t *truncate_to)
+{
+	const struct cred *saved_cred;
+	struct cred *override_cred;
+	struct path tmpfile = { .mnt = path->mnt, .dentry = NULL };
+	int ret;
+
+	pr_devel("-->%s(,%pd,%pd,%pd,,%lld)\n",
+		 __func__, parent->dentry, path->dentry, actual->dentry,
+		 truncate_to ? *truncate_to : -1);
+
+	override_cred = prepare_kernel_cred(NULL);
+	if (!override_cred)
+		return -ENOMEM;
+	/* Do the copy with the fsuid/fsgid of the lower file's owner. */
+	override_cred->fsuid = actual->dentry->d_inode->i_uid;
+	override_cred->fsgid = actual->dentry->d_inode->i_gid;
+
+	saved_cred = override_creds(override_cred);
+
+	ret = union_copy_up_to_tmpfile(parent, path, actual, &tmpfile,
+				       truncate_to);
+
+	if (ret == 0)
+		ret = union_hard_link_to_tmpfile(parent, path, &tmpfile);
+
+	/* Discard the temporary dentry */
+	dput(tmpfile.dentry);
+
+	revert_creds(saved_cred);
+
+	put_cred(override_cred);
+	pr_devel("<--%s() = %d\n", __func__, ret);
+	return ret;
+}
+
+/**
+ * __union_copy_up - Copy a non-directory file up to the upper layer.
+ * @path: The upper (destination) object.
+ * @actual: The lower source; pointed at *@path once the upper is positive.
+ * @truncate_to: If not NULL, the amount of data to copy up.
+ */
+int __union_copy_up(struct path *path, struct path *actual,
+		    const loff_t *truncate_to)
+{
+	struct dentry *upper = path->dentry;
+	struct path parent = { .mnt = path->mnt };
+	int ret;
+
+	pr_devel("-->%s(%pd)\n", __func__, path->dentry);
+
+	/* We don't currently support copyup of special files, though in theory
+	 * blockdev and chrdev files could at least be copied up.  FIFO files
+	 * are problematic if open.  Socket files are managed by AF_UNIX and
+	 * would need help from there.  Directories are handled by pathwalk.
+	 */
+	if (!S_ISREG(actual->dentry->d_inode->i_mode) &&
+	    !S_ISLNK(actual->dentry->d_inode->i_mode))
+		return -EACCES;
+
+	/* We need to get the parent directory and then we need to lock it.
+	 * Use the rename mutex to prevent rename from getting underfoot whilst
+	 * we do this.
+	 */
+	if (mutex_lock_interruptible(&upper->d_sb->s_vfs_rename_mutex) < 0)
+		return -EINTR;
+
+	if (upper->d_inode) {
+		mutex_unlock(&upper->d_sb->s_vfs_rename_mutex);
+		goto already_copied_up;
+	}
+
+	parent.dentry = dget_parent(upper);
+	BUG_ON(IS_OPAQUE(parent.dentry->d_inode) && !d_is_fallthru(upper));
+	BUG_ON(d_is_whiteout(upper));
+
+	ret = mutex_lock_interruptible(&parent.dentry->d_inode->i_mutex);
+	mutex_unlock(&upper->d_sb->s_vfs_rename_mutex);
+	if (ret < 0) {
+		dput(parent.dentry);
+		goto out;
+	}
+	if (upper->d_inode)
+		goto already_copied_up_unlock;
+
+	/* Do the copy up; on success the upper is the actual object */
+	ret = union_copy_up_via_tmpfile(&parent, path, actual, truncate_to);
+	mutex_unlock(&parent.dentry->d_inode->i_mutex);
+	dput(parent.dentry);
+	if (ret == 0)
+		*actual = *path;
+out:
+	pr_devel("<--%s() = %d\n", __func__, ret);
+	return ret;
+
+already_copied_up_unlock:
+	mutex_unlock(&parent.dentry->d_inode->i_mutex);
+	dput(parent.dentry);
+already_copied_up:
+	pr_devel("<--%s() = 0 [already done]\n", __func__);
+	*actual = *path;
+	return 0;
+}
+
+/**
+ * __union_copy_up_for_do_last - Copy up a file for do_last()
+ * @parent: The parent directory of the file to be copied up.
+ * @path: The file to be copied up _to_.
+ * @will_truncate: Whether or not O_TRUNC is in force.
+ *
+ * Copy up for do_last().  The caller is expected to hold the want-write lock
+ * and to have called union_lookup_point*() first.  Returns 0 or -ve errno.
+ */
+int __union_copy_up_for_do_last(struct path *parent, struct path *path,
+				bool will_truncate)
+{
+	struct path lower_cache, actual;
+	struct inode *inode;
+	loff_t zero = 0;
+	int ret;
+
+	pr_devel("-->%s(,%pd{%pd},)\n",
+		 __func__, path->dentry,
+		 path->dentry->d_fallthru);
+
+	BUG_ON(!(path->dentry->d_flags & DCACHE_UNION_LOOKUP_DONE));
+	BUG_ON(!(path->dentry->d_flags & DCACHE_UNION_PINNING_LOWER));
+	BUG_ON(!path->dentry->d_fallthru);
+
+	ret = mutex_lock_interruptible(&parent->dentry->d_inode->i_mutex);
+	if (ret < 0)
+		return ret;
+
+	/* Check to see if we raced with another copy-up or an unlink */
+	ret = 0;
+	if (path->dentry->d_parent != parent->dentry ||
+	    path->dentry->d_inode)
+		goto unlock_out;
+
+	inode = union_get_inode_locked(parent->dentry, path,
+				       &lower_cache, &actual);
+	if (IS_ERR(inode)) {
+		ret = PTR_ERR(inode);
+		goto unlock_out;
+	}
+
+	/* Copy up.  NOTE(review): presumes inode is not NULL - confirm */
+	ret = union_copy_up_via_tmpfile(parent, path, &actual,
+					will_truncate ? &zero : NULL);
+	mutex_unlock(&parent->dentry->d_inode->i_mutex);
+	path_put_maybe(&lower_cache);
+
+	pr_devel("<--%s() = %d [post]\n", __func__, ret);
+	return ret;
+
+unlock_out:
+	mutex_unlock(&parent->dentry->d_inode->i_mutex);
+	pr_devel("<--%s() = %d [pre]\n", __func__, ret);
+	return ret;
+}
diff --git a/fs/union.h b/fs/union.h
new file mode 100644
index 0000000..8b605c1
--- /dev/null
+++ b/fs/union.h
@@ -0,0 +1,363 @@
+/* VFS-based union mounts for Linux
+ *
+ * Copyright (C) 2004-2007 IBM Corporation, IBM Deutschland Entwicklung GmbH.
+ * Copyright (C) 2007-2009 Novell Inc.
+ * Copyright (C) 2009-2012 Red Hat, Inc.
+ *
+ *   Author(s): Jan Blunck (j.blunck@tu-harburg.de)
+ *              Valerie Aurora <vaurora@redhat.com>
+ *              David Howells <dhowells@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/mount.h>
+#include <linux/dcache.h>
+#include <linux/namei.h>
+#include <linux/path.h>
+#include <linux/slab.h>
+#include <linux/bug.h>
+
+/*
+ * WARNING! Confusing terminology alert.
+ *
+ * Note that the directions "up" and "down" in union mounts are the opposite of
+ * "up" and "down" in normal VFS operation terminology.  "Up" in the rest of
+ * the VFS means "towards the root of the mount tree."  If you mount B on top
+ * of A, following B "up" will get you A.  In union mounts, "up" means "towards
+ * the most recently mounted layer of the union stack."  If you union mount B
+ * on top of A, following A "up" will get you to B.  Another way to put it is
+ * that "up" in the VFS means going from this mount towards the direction of
+ * its mnt->mnt_parent pointer, but "up" in union mounts means going in the
+ * opposite direction (until you run out of union layers).
+ */
+
+/*
+ * The union_stack structure.  It is an array of struct paths of
+ * directories below the topmost directory in a unioned directory.  The
+ * topmost dentry has a pointer to this structure.  The topmost dentry
+ * can only be part of one union, so we can reference it from the
+ * dentry, but lower dentries can be part of multiple union stacks.
+ *
+ * The number of dirs actually allocated is kept in the superblock,
+ * s_union_count.
+ */
+struct union_stack {
+	struct path u_dirs[0];
+};
+
+/**
+ * union_alloc_stack - allocate a union stack
+ * @topmost: path of topmost directory
+ *
+ * Allocate a zeroed union_stack with one struct path slot per layer counted
+ * in the superblock's s_union_count.  Returns NULL on allocation failure.
+ */
+static inline struct union_stack *union_alloc_stack(const struct path *topmost)
+{
+	unsigned layers = topmost->dentry->d_sb->s_union_count;
+	return kcalloc(layers, sizeof(struct path), GFP_KERNEL);
+}
+
+#ifdef CONFIG_UNION_MOUNT
+
+static inline bool IS_MNT_UNION(const struct vfsmount *mnt)
+{
+	return (mnt->mnt_flags & MNT_UNION) != 0;
+}
+
+static inline bool IS_PATH_UNIONED(const struct path *path)
+{
+	return IS_MNT_UNION(path->mnt);
+}
+
+static inline bool IS_DIR_UNIONED(const struct dentry *dentry)
+{
+	return dentry->d_union_stack != NULL;
+}
+
+extern void d_free_unions(struct dentry *);
+extern int union_add_dir(struct path *, struct path *, unsigned int);
+
+static inline
+struct path *union_find_dir(struct dentry *dentry, unsigned int layer)
+{
+	BUG_ON(layer >= dentry->d_sb->s_union_count);
+	return dentry->d_union_stack->u_dirs + layer;
+}
+
+
+extern int union_create_topmost_dir(struct path *, struct path *, struct union_stack *);
+
+extern int __union_copy_up_dir(struct path *);
+
+#else /* CONFIG_UNION_MOUNT */
+
+static inline bool IS_MNT_UNION(const struct vfsmount *mnt) { return false; }
+static inline bool IS_PATH_UNIONED(const struct path *path) { return false; }
+static inline bool IS_DIR_UNIONED(const struct dentry *dentry) { return false; }
+static inline void d_free_unions(struct dentry *dentry) {}
+
+static inline
+int union_add_dir(struct path *topmost, struct path *lower, unsigned layer)
+{
+	BUG();
+	return 0;
+}
+
+static inline struct path *union_find_dir(struct dentry *dentry, unsigned layer)
+{
+	BUG();
+	return NULL;
+}
+
+static inline int union_create_topmost_dir(struct path *parent,
+					   struct path *topmost,
+					   struct union_stack *d)
+{
+	BUG();
+	return 0;
+}
+
+static inline int __union_copy_up_dir(struct path *topmost_path)
+{
+	BUG();
+	return 0;
+}
+
+#endif	/* CONFIG_UNION_MOUNT */
+
+/*
+ * Ensure that an upper directory is opaque, ie. completely copied up if it is
+ * actually unioned with lower directories; already-opaque ones need no work.
+ */
+static inline int union_copy_up_dir(struct path *path)
+{
+	if (!IS_OPAQUE(path->dentry->d_inode))
+		return __union_copy_up_dir(path);
+	return 0;
+}
+
+extern struct inode *__union_get_inode_locked(struct dentry *parent,
+					      struct path *upper,
+					      struct path *_lower_cache,
+					      struct path *_actual);
+extern struct inode *__union_get_inode(struct path *upper,
+				       struct path *_lower_cache,
+				       struct path *_actual);
+extern int __union_copy_up(struct path *path, struct path *actual,
+			   const loff_t *truncate_to);
+
+extern int __union_copy_up_locked(struct path *parent, struct path *path,
+				  struct path *actual,
+				  const loff_t *truncate_to);
+
+static inline void path_put_maybe(struct path *path)
+{
+	/* These optimise away if CONFIG_UNION_MOUNT=n */
+	if (unlikely(path->dentry))
+		dput(path->dentry);
+	if (unlikely(path->mnt))
+		mntput(path->mnt);
+}
+
+/**
+ * union_get_inode_locked - Get the actual inode and dentry for a dentry
+ * @parent: The locked parent of the object we're interested in.
+ * @path: The object we're interested in.
+ * @_lower_cache: Cache for lower dentry pinning.
+ * @_actual: The point actually corresponding to the returned inode.
+ *
+ * Gets the inode to be used for a dentry where that inode may exist on a lower
+ * layer in a union.  Note that we don't get a ref on the inode, so to pin it
+ * temporarily, we may point *_lower_cache at the lower dentry.
+ *
+ * The caller must hold i_mutex on the parent.
+ *
+ * Returns a pointer to the inode to use if a positive dentry is found, NULL if
+ * a negative dentry is found and an error if lookup in the lower layers
+ * failed.
+ *
+ * On a successful return (positive or negative dentry), *_actual will be set
+ * to point to the dentry that we determined was the one of interest.  This
+ * does not hold any refs of its own.
+ *
+ * The caller should call path_put_maybe() on *_lower_cache to clear any pins
+ * it may contain.
+ */
+static inline struct inode *union_get_inode_locked(struct dentry *parent,
+						   struct path *path,
+						   struct path *_lower_cache,
+						   struct path *_actual)
+{
+	/* Default results: no lower pin, and the actual point is @path. */
+	_lower_cache->dentry = NULL;
+	_lower_cache->mnt = NULL;
+	*_actual = *path;
+
+#ifndef CONFIG_UNION_MOUNT
+	return path->dentry->d_inode;
+#else
+	/* The normal case is that the inode is right where we expect... */
+	if (likely(path->dentry->d_inode))
+		return path->dentry->d_inode;
+
+	/* ... or the dentry is ordinarily negative. */
+	if (likely(!path->dentry->d_sb->s_union_lower_mnts))
+		return NULL;
+
+	if (d_is_whiteout(path->dentry) ||
+	    (!d_is_fallthru(path->dentry) && IS_OPAQUE(parent->d_inode)))
+		return NULL;
+
+	/* Slow path.  The parent was locked by our caller, not here. */
+	return __union_get_inode_locked(parent, path, _lower_cache, _actual);
+#endif
+}
+
+/**
+ * union_get_inode - Get the actual inode and dentry for an object
+ * @path: The object we're interested in.
+ * @_lower_cache: Cache for lower dentry pinning.
+ * @_actual: The point actually corresponding to the returned inode.
+ *
+ * Gets the inode to be used for a dentry where that inode may exist on a lower
+ * layer in a union.  No ref is taken on the inode; instead *_lower_cache may
+ * pin the lower dentry and should be released by calling path_put_maybe().
+ *
+ * Returns a pointer to the inode to use if a positive dentry is found, NULL if
+ * a negative dentry is found and an error if lookup in the lower layers
+ * failed.
+ *
+ * On a successful return (positive or negative dentry), *_actual will be set
+ * to point to the dentry that we determined was the one of interest.  This
+ * does not have its own ref taken and thus does not need to be dput().
+ */
+static inline struct inode *union_get_inode(struct path *path,
+					    struct path *_lower_cache,
+					    struct path *_actual)
+{
+	_lower_cache->mnt = NULL;
+	_lower_cache->dentry = NULL;
+	*_actual = *path;
+
+#ifndef CONFIG_UNION_MOUNT
+	return path->dentry->d_inode;
+#else
+	/* The normal case is that the inode is right where we expect... */
+	if (likely(path->dentry->d_inode))
+		return path->dentry->d_inode;
+
+	/* ... or the dentry is ordinarily negative. */
+	if (likely(!path->dentry->d_sb->s_union_lower_mnts))
+		return NULL;
+
+	if (d_is_whiteout(path->dentry))
+		return NULL;
+
+	/* We have to lock the parent and do a lookup. */
+	return __union_get_inode(path, _lower_cache, _actual);
+#endif
+}
+
+/**
+ * union_truncated_copy_up - If needed, partially copy up a file (truncate)
+ * @path: The target object.
+ * @actual: The point from union_get_inode(), passed to __union_copy_up().
+ * @truncate_to: The amount to copy up.
+ */
+static inline int union_truncated_copy_up(struct path *path, struct path *actual,
+					  const loff_t *truncate_to)
+{
+#ifdef CONFIG_UNION_MOUNT
+	if (unlikely(!path->dentry->d_inode))
+		return __union_copy_up(path, actual, truncate_to);
+#endif
+	return 0;
+}
+
+/**
+ * union_copy_up - If needed, copy up a file in its entirety
+ * @path: The target object.
+ * @actual: The point from union_get_inode(), passed to __union_copy_up().
+ */
+static inline int union_copy_up(struct path *path, struct path *actual)
+{
+#ifdef CONFIG_UNION_MOUNT
+	if (unlikely(!path->dentry->d_inode))
+		return __union_copy_up(path, actual, NULL);
+#endif
+	return 0;
+}
+
+/**
+ * union_copy_up_locked - If needed, copy up a file, caller holds parent lock
+ * @parent: The parent directory of the target object
+ * @path: The target object.
+ * @actual: The point from union_get_inode(); currently unused.
+ *
+ * The caller must hold i_mutex on the parent directory.  Locked copy-up is
+ * not wired up yet, so a dentry still needing copy-up yields -ENOANO.
+ */
+static inline int union_copy_up_locked(struct path *parent, struct path *path,
+				       struct path *actual)
+{
+#ifdef CONFIG_UNION_MOUNT
+	/* TODO: call __union_copy_up_locked() once it is usable here. */
+	if (unlikely(!path->dentry->d_inode))
+		return -ENOANO;
+#endif
+	return 0;
+}
+
+extern int __union_copy_up_for_do_last(struct path *, struct path *, bool);
+
+/**
+ * union_copy_up_for_do_last - If needed, copy up a file (maybe truncated)
+ * @parent: The parent directory of the target object.
+ * @path: The target object.
+ * @will_truncate: Whether to honour O_TRUNC or not.
+ */
+static inline int union_copy_up_for_do_last(struct path *parent, struct path *path,
+					    bool will_truncate)
+{
+#ifdef CONFIG_UNION_MOUNT
+	if (unlikely(!path->dentry->d_inode))
+		return __union_copy_up_for_do_last(parent, path, will_truncate);
+#endif
+	return 0;
+}
+
+static inline bool d_is_unioned(const struct dentry *dentry, const struct path *actual)
+{
+#ifdef CONFIG_UNION_MOUNT
+	return unlikely(dentry != actual->dentry);
+#else
+	return false;
+#endif
+}
+
+static inline bool is_unioned(const struct dentry *dentry, const struct inode *inode)
+{
+#ifdef CONFIG_UNION_MOUNT
+	return unlikely(dentry->d_inode != inode);
+#else
+	return false;
+#endif
+}
+
+extern struct union_stack *union_alloc(const struct path *topmost);
+
+static inline void union_free(const struct path *path, struct union_stack *d)
+{
+	unsigned i, layers = path->dentry->d_sb->s_union_count;
+
+	if (!d)
+		return;
+	for (i = 0; i < layers; i++)
+		path_put(d->u_dirs + i);
+	kfree(d);
+}
diff --git a/fs/utimes.c b/fs/utimes.c
index aa138d6..a42c98f 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -10,6 +10,8 @@
 #include <linux/syscalls.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
+#include "internal.h"
+#include "union.h"
 
 #ifdef __ARCH_WANT_SYS_UTIME
 
@@ -54,10 +56,17 @@
 	struct iattr newattrs;
 	struct inode *inode = path->dentry->d_inode;
 	struct inode *delegated_inode = NULL;
+	struct path lower_cache, actual;
+
+	inode = union_get_inode(path, &lower_cache, &actual);
+	if (IS_ERR(inode)) {
+		error = PTR_ERR(inode);
+		goto out;
+	}
 
 	error = mnt_want_write(path->mnt);
 	if (error)
-		goto out;
+		goto out_put_cache;
 
 	if (times && times[0].tv_nsec == UTIME_NOW &&
 		     times[1].tv_nsec == UTIME_NOW)
@@ -97,12 +106,27 @@
 			goto mnt_drop_write_and_out;
 
 		if (!inode_owner_or_capable(inode)) {
-			error = inode_permission(inode, MAY_WRITE);
-			if (error)
-				goto mnt_drop_write_and_out;
+			/* We have to be able to write to the upperfs. */
+			if (d_is_unioned(path->dentry, &actual)) {
+				error = -EROFS;
+				if (path->dentry->d_sb->s_flags & MS_RDONLY)
+					goto mnt_drop_write_and_out;
+				error = __inode_permission(inode, MAY_WRITE);
+				if (error)
+					goto mnt_drop_write_and_out;
+			} else {
+				error = inode_permission(inode, MAY_WRITE);
+				if (error)
+					goto mnt_drop_write_and_out;
+			}
 		}
 	}
+
 retry_deleg:
+	error = union_copy_up(path, &actual);
+	if (error)
+		goto mnt_drop_write_and_out;
+
 	mutex_lock(&inode->i_mutex);
 	error = notify_change(path->dentry, &newattrs, &delegated_inode);
 	mutex_unlock(&inode->i_mutex);
@@ -114,6 +138,8 @@
 
 mnt_drop_write_and_out:
 	mnt_drop_write(path->mnt);
+out_put_cache:
+	path_put_maybe(&lower_cache);
 out:
 	return error;
 }
diff --git a/fs/xattr.c b/fs/xattr.c
index ead3e34..912e399 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -23,13 +23,19 @@
 #include <linux/posix_acl_xattr.h>
 
 #include <asm/uaccess.h>
+#include "internal.h"
+#include "union.h"
 
 /*
  * Check permissions for extended attribute access.  This is a bit complicated
  * because different namespaces have very different rules.
+ *
+ * Note: in unionmount conditions, dentry must be on the _upper_ layer whilst
+ * inode may be on the lower.
  */
 static int
-xattr_permission(struct inode *inode, const char *name, int mask)
+xattr_permission(struct dentry *dentry, struct inode *inode, const char *name,
+		 int mask)
 {
 	/*
 	 * We can never set or remove an extended attribute on a read-only
@@ -70,7 +76,13 @@
 			return -EPERM;
 	}
 
-	return inode_permission(inode, mask);
+	if (is_unioned(dentry, inode)) {
+		if (mask & MAY_WRITE && dentry->d_sb->s_flags & MS_RDONLY)
+			return -EROFS;
+		return __inode_permission(inode, mask);
+	} else {
+		return inode_permission(inode, mask);
+	}
 }
 
 /**
@@ -87,7 +99,7 @@
  *
  *  This function requires the caller to lock the inode's i_mutex before it
  *  is executed. It also assumes that the caller will make the appropriate
- *  permission checks.
+ *  permission checks.  The caller must also have copied up for unionmount.
  */
 int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
 		const void *value, size_t size, int flags)
@@ -97,6 +109,8 @@
 	int issec = !strncmp(name, XATTR_SECURITY_PREFIX,
 				   XATTR_SECURITY_PREFIX_LEN);
 
+	if (!inode)
+		return -ENOENT;
 	if (issec)
 		inode->i_flags &= ~S_NOSEC;
 	if (inode->i_op->setxattr) {
@@ -122,23 +136,40 @@
 vfs_setxattr(struct path *path, const char *name, const void *value,
 		size_t size, int flags)
 {
-	struct dentry *dentry = path->dentry;
-	struct inode *inode = dentry->d_inode;
+	struct path lower_cache, actual;
+	struct inode *inode;
 	int error;
 
-	error = xattr_permission(inode, name, MAY_WRITE);
+	inode = union_get_inode(path, &lower_cache, &actual);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+again:
+	error = xattr_permission(path->dentry, inode, name, MAY_WRITE);
 	if (error)
-		return error;
+		goto out_lower;
 
 	mutex_lock(&inode->i_mutex);
-	error = security_inode_setxattr(dentry, name, value, size, flags);
+	error = security_inode_setxattr(actual.dentry, name, value, size, flags);
 	if (error)
-		goto out;
+		goto out_unlock;
 
-	error = __vfs_setxattr_noperm(dentry, name, value, size, flags);
+	if (d_is_unioned(path->dentry, &actual)) {
+		/* Still on the lower layer: copy up, then recheck */
+		mutex_unlock(&inode->i_mutex);
+		error = union_copy_up(path, &actual);
+		if (error)
+			goto out_lower;
+		inode = actual.dentry->d_inode;
+		goto again;
+	}
 
-out:
+	error = __vfs_setxattr_noperm(actual.dentry, name, value, size, flags);
+
+out_unlock:
 	mutex_unlock(&inode->i_mutex);
+out_lower:
+	path_put_maybe(&lower_cache);
 	return error;
 }
 EXPORT_SYMBOL_GPL(vfs_setxattr);
@@ -186,7 +217,7 @@
 	char *value = *xattr_value;
 	int error;
 
-	error = xattr_permission(inode, name, MAY_READ);
+	error = xattr_permission(dentry, inode, name, MAY_READ);
 	if (error)
 		return error;
 
@@ -231,55 +262,72 @@
 ssize_t
 vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size)
 {
-	struct inode *inode = dentry->d_inode;
-	int error;
+	struct inode *inode;
+	struct path lower_cache, actual;
+	struct path path = { .dentry = dentry };
+	ssize_t error;
 
-	error = xattr_permission(inode, name, MAY_READ);
-	if (error)
-		return error;
+	inode = union_get_inode(&path, &lower_cache, &actual);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
 
-	error = security_inode_getxattr(dentry, name);
+	error = xattr_permission(dentry, inode, name, MAY_READ);
 	if (error)
-		return error;
+		goto out_dput;
+
+	error = security_inode_getxattr(actual.dentry, name);
+	if (error)
+		goto out_dput;
 
 	if (!strncmp(name, XATTR_SECURITY_PREFIX,
 				XATTR_SECURITY_PREFIX_LEN)) {
 		const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
-		int ret = xattr_getsecurity(inode, suffix, value, size);
+		ssize_t ret = xattr_getsecurity(inode, suffix, value, size);
 		/*
 		 * Only overwrite the return value if a security module
 		 * is actually active.
 		 */
 		if (ret == -EOPNOTSUPP)
 			goto nolsm;
-		return ret;
+		error = ret;
+		goto out_dput;
 	}
 nolsm:
 	if (inode->i_op->getxattr)
-		error = inode->i_op->getxattr(dentry, name, value, size);
+		error = inode->i_op->getxattr(actual.dentry, name, value, size);
 	else
 		error = -EOPNOTSUPP;
 
+out_dput:
+	path_put_maybe(&lower_cache);
 	return error;
 }
 EXPORT_SYMBOL_GPL(vfs_getxattr);
 
 ssize_t
-vfs_listxattr(struct dentry *d, char *list, size_t size)
+vfs_listxattr(struct dentry *dentry, char *list, size_t size)
 {
+	struct inode *inode;
+	struct path lower_cache, actual;
+	struct path path = { .dentry = dentry };
 	ssize_t error;
 
-	error = security_inode_listxattr(d);
-	if (error)
-		return error;
-	error = -EOPNOTSUPP;
-	if (d->d_inode->i_op->listxattr) {
-		error = d->d_inode->i_op->listxattr(d, list, size);
-	} else {
-		error = security_inode_listsecurity(d->d_inode, list, size);
-		if (size && error > size)
-			error = -ERANGE;
+	inode = union_get_inode(&path, &lower_cache, &actual);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	error = security_inode_listxattr(actual.dentry);
+	if (!error) {
+		error = -EOPNOTSUPP;
+		if (inode->i_op->listxattr) {
+			error = inode->i_op->listxattr(actual.dentry, list, size);
+		} else {
+			error = security_inode_listsecurity(inode, list, size);
+			if (size && error > size)
+				error = -ERANGE;
+		}
 	}
+	path_put_maybe(&lower_cache);
 	return error;
 }
 EXPORT_SYMBOL_GPL(vfs_listxattr);
@@ -287,31 +335,48 @@
 int
 vfs_removexattr(struct path *path, const char *name)
 {
-	struct dentry *dentry = path->dentry;
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode;
+	struct path lower_cache, actual;
 	int error;
 
-	if (!inode->i_op->removexattr)
-		return -EOPNOTSUPP;
+	inode = union_get_inode(path, &lower_cache, &actual);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
 
-	error = xattr_permission(inode, name, MAY_WRITE);
+again:
+	error = -EOPNOTSUPP;
+	if (!inode->i_op->removexattr)
+		goto out;
+
+	error = xattr_permission(path->dentry, inode, name, MAY_WRITE);
 	if (error)
-		return error;
+		goto out;
 
 	mutex_lock(&inode->i_mutex);
-	error = security_inode_removexattr(dentry, name);
+	error = security_inode_removexattr(actual.dentry, name);
 	if (error) {
 		mutex_unlock(&inode->i_mutex);
-		return error;
+		goto out;
 	}
 
-	error = inode->i_op->removexattr(dentry, name);
+	if (d_is_unioned(path->dentry, &actual)) {
+		mutex_unlock(&inode->i_mutex);
+		error = union_copy_up(path, &actual);
+		if (error)
+			goto out;
+		inode = actual.dentry->d_inode;
+		goto again;
+	}
+
+	error = inode->i_op->removexattr(actual.dentry, name);
 	mutex_unlock(&inode->i_mutex);
 
 	if (!error) {
-		fsnotify_xattr(dentry);
-		evm_inode_post_removexattr(dentry, name);
+		fsnotify_xattr(actual.dentry);
+		evm_inode_post_removexattr(actual.dentry, name);
 	}
+out:
+	path_put_maybe(&lower_cache);
 	return error;
 }
 EXPORT_SYMBOL_GPL(vfs_removexattr);
@@ -426,12 +491,16 @@
 	if (!f.file)
 		return error;
 	dentry = f.file->f_path.dentry;
+	error = -EACCES;
+	if (f.file->f_inode != dentry->d_inode)
+		goto error; /* Can't alter an open lower union file this way */
 	audit_inode(NULL, dentry, 0);
 	error = mnt_want_write_file(f.file);
 	if (!error) {
 		error = setxattr(&f.file->f_path, name, value, size, flags);
 		mnt_drop_write_file(f.file);
 	}
+error:
 	fdput(f);
 	return error;
 }
@@ -526,13 +595,15 @@
 SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
 		void __user *, value, size_t, size)
 {
+	struct dentry *dentry;
 	struct fd f = fdget(fd);
 	ssize_t error = -EBADF;
 
 	if (!f.file)
 		return error;
-	audit_inode(NULL, f.file->f_path.dentry, 0);
-	error = getxattr(f.file->f_path.dentry, name, value, size);
+	audit_file(NULL, f.file, 0);
+	dentry = d_dentry_or_lower(f.file->f_path.dentry);
+	error = getxattr(dentry, name, value, size);
 	fdput(f);
 	return error;
 }
@@ -615,13 +686,15 @@
 
 SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
 {
+	struct dentry *dentry;
 	struct fd f = fdget(fd);
 	ssize_t error = -EBADF;
 
 	if (!f.file)
 		return error;
-	audit_inode(NULL, f.file->f_path.dentry, 0);
-	error = listxattr(f.file->f_path.dentry, list, size);
+	audit_file(NULL, f.file, 0);
+	dentry = d_dentry_or_lower(f.file->f_path.dentry);
+	error = listxattr(dentry, list, size);
 	fdput(f);
 	return error;
 }
@@ -701,12 +774,16 @@
 	if (!f.file)
 		return error;
 	dentry = f.file->f_path.dentry;
+	error = -EACCES;
+	if (f.file->f_inode != dentry->d_inode)
+		goto error; /* Can't alter an open lower union file this way */
 	audit_inode(NULL, dentry, 0);
 	error = mnt_want_write_file(f.file);
 	if (!error) {
 		error = removexattr(&f.file->f_path, name);
 		mnt_drop_write_file(f.file);
 	}
+error:
 	fdput(f);
 	return error;
 }
diff --git a/include/linux/audit.h b/include/linux/audit.h
index a406419..f863680 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -108,8 +108,10 @@
 
 #define AUDIT_INODE_PARENT	1	/* dentry represents the parent */
 #define AUDIT_INODE_HIDDEN	2	/* audit record should be hidden */
-extern void __audit_inode(struct filename *name, const struct dentry *dentry,
-				unsigned int flags);
+extern void __audit_dentry(struct filename *name, const struct dentry *dentry,
+			   unsigned flags);
+extern void __audit_file(struct filename *name, const struct file *file,
+			 unsigned flags);
 extern void __audit_inode_child(const struct inode *parent,
 				const struct dentry *dentry,
 				const unsigned char type);
@@ -153,23 +155,30 @@
 	if (unlikely(!audit_dummy_context()))
 		__audit_getname(name);
 }
+
 static inline void audit_inode(struct filename *name,
-				const struct dentry *dentry,
-				unsigned int parent) {
-	if (unlikely(!audit_dummy_context())) {
-		unsigned int flags = 0;
-		if (parent)
-			flags |= AUDIT_INODE_PARENT;
-		__audit_inode(name, dentry, flags);
-	}
+			       const struct dentry *dentry,
+			       bool parent)
+{
+	if (unlikely(!audit_dummy_context()))
+		__audit_dentry(name, dentry, parent ? AUDIT_INODE_PARENT : 0);
 }
+
 static inline void audit_inode_parent_hidden(struct filename *name,
 						const struct dentry *dentry)
 {
 	if (unlikely(!audit_dummy_context()))
-		__audit_inode(name, dentry,
-				AUDIT_INODE_PARENT | AUDIT_INODE_HIDDEN);
+		__audit_dentry(name, dentry,
+			       AUDIT_INODE_PARENT | AUDIT_INODE_HIDDEN);
 }
+
+static inline void audit_file(struct filename *name, const struct file *file,
+			      bool parent)
+{
+	if (unlikely(!audit_dummy_context()))
+		__audit_file(name, file, parent ? AUDIT_INODE_PARENT : 0);
+}
+
 static inline void audit_inode_child(const struct inode *parent,
 				     const struct dentry *dentry,
 				     const unsigned char type) {
@@ -325,21 +334,20 @@
 { }
 static inline void audit_putname(struct filename *name)
 { }
-static inline void __audit_inode(struct filename *name,
-					const struct dentry *dentry,
-					unsigned int flags)
-{ }
 static inline void __audit_inode_child(const struct inode *parent,
 					const struct dentry *dentry,
 					const unsigned char type)
 { }
 static inline void audit_inode(struct filename *name,
 				const struct dentry *dentry,
-				unsigned int parent)
+				bool parent)
 { }
 static inline void audit_inode_parent_hidden(struct filename *name,
 				const struct dentry *dentry)
 { }
+static inline void audit_file(struct filename *name, const struct file *file,
+			      bool parent)
+{ }
 static inline void audit_inode_child(const struct inode *parent,
 				     const struct dentry *dentry,
 				     const unsigned char type)
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 0f3ed77..c68b6aa 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -14,6 +14,7 @@
 struct nameidata;
 struct path;
 struct vfsmount;
+struct union_stack;
 
 /*
  * linux/include/linux/dcache.h
@@ -92,16 +93,36 @@
  * Try to keep struct dentry aligned on 64 byte cachelines (this will
  * give reasonable cacheline footprint with larger lines without the
  * large memory footprint increase).
+ *
+ * XXX DNAME_INLINE_LEN_MIN is kind of pitiful on 64bit + union
+ * mounts.  May be worth tuning up, but either we go to 256 bytes and
+ * a wasteful 88 bytes of d_iname, or we lose 64-byte alignment.
  */
 #ifdef CONFIG_64BIT
+
+#ifdef CONFIG_UNION_MOUNT
+# define DNAME_INLINE_LEN 24 /* 192 bytes */
+#else
 # define DNAME_INLINE_LEN 32 /* 192 bytes */
+#endif /* CONFIG_UNION_MOUNT */
+
+#else
+
+#ifdef CONFIG_UNION_MOUNT
+# ifdef CONFIG_SMP
+#  define DNAME_INLINE_LEN 32 /* 128 bytes */
+# else
+#  define DNAME_INLINE_LEN 36 /* 128 bytes */
+# endif
 #else
 # ifdef CONFIG_SMP
 #  define DNAME_INLINE_LEN 36 /* 128 bytes */
 # else
 #  define DNAME_INLINE_LEN 40 /* 128 bytes */
 # endif
-#endif
+#endif /* CONFIG_UNION_MOUNT */
+
+#endif /* CONFIG_64BIT */
 
 #define d_lock	d_lockref.lock
 
@@ -123,6 +144,12 @@
 	unsigned long d_time;		/* used by d_revalidate */
 	void *d_fsdata;			/* fs-specific data */
 
+#ifdef CONFIG_UNION_MOUNT
+	union {
+		struct union_stack *d_union_stack; /* Dirs in union stack */
+		struct dentry *d_fallthru; /* Lower dentry pinned by fallthru */
+	};
+#endif
 	struct list_head d_lru;		/* LRU list */
 	/*
 	 * d_child and d_rcu can share memory
@@ -495,6 +522,71 @@
 #endif
 }
 
+static inline bool d_has_lower(const struct dentry *dentry)
+{
+	return unlikely(!dentry->d_inode &&
+			(d_is_fallthru(dentry) ||
+			 dentry->d_flags & DCACHE_UNION_PINNING_LOWER));
+}
+
+static inline void d_set_union_stack(struct dentry *dentry, struct union_stack *d)
+{
+#ifdef CONFIG_UNION_MOUNT
+	BUG_ON(dentry->d_union_stack != NULL);
+	dentry->d_union_stack = d;
+#endif
+}
+
+static inline void d_pin_lower(struct dentry *dentry, struct dentry *lower)
+{
+#ifdef CONFIG_UNION_MOUNT
+	BUG_ON(dentry->d_fallthru != NULL); /* a lower dentry may be set once */
+	dentry->d_fallthru = lower;
+	smp_wmb(); /* pairs with smp_rmb() in d_is_pinning_lower() */
+	dentry->d_flags |= DCACHE_UNION_PINNING_LOWER;
+#endif
+}
+
+static inline bool d_is_pinning_lower(const struct dentry *dentry)
+{
+#ifdef CONFIG_UNION_MOUNT
+	if (unlikely(dentry->d_flags & DCACHE_UNION_PINNING_LOWER)) {
+		smp_rmb(); /* d_fallthru must be read only after this flag is
+			    * checked. */
+		return true;
+	}
+#endif
+	return false;
+}
+
+static inline struct dentry *d_get_fallthru(struct dentry *dentry)
+{
+#ifdef CONFIG_UNION_MOUNT
+	return dentry->d_fallthru;
+#else
+	return NULL;
+#endif
+}
+
+static inline struct dentry *d_dentry_or_lower(struct dentry *dentry)
+{
+#ifdef CONFIG_UNION_MOUNT
+	return dentry->d_inode ? dentry : dentry->d_fallthru;
+#else
+	return dentry;
+#endif
+}
+
+static inline struct inode *d_inode_or_lower(struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+#ifdef CONFIG_UNION_MOUNT
+	if (!inode && d_is_pinning_lower(dentry))
+		inode = dentry->d_fallthru->d_inode;
+#endif
+	return inode;
+}
+
 extern int sysctl_vfs_cache_pressure;
 
 static inline unsigned long vfs_pressure_ratio(unsigned long val)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3ee9f62..1b48483 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1325,6 +1325,16 @@
 	/* AIO completions deferred from interrupt context */
 	struct workqueue_struct *s_dio_done_wq;
 
+	/* Root of the private cloned vfsmount tree of the read-only
+	 * mounts in this union (set in topmost vfsmount only)
+	 */
+	struct vfsmount *s_union_lower_mnts;
+
+	/* Number of layers in this union, not counting the topmost or
+	 * submounts.
+	 */
+	unsigned int s_union_count;
+
 	/*
 	 * Keep the lru lists last in the structure so they always sit on their
 	 * own individual cachelines.
@@ -2657,6 +2667,21 @@
 
 extern int generic_check_addressable(unsigned, u64);
 
+#ifdef CONFIG_UNION_MOUNT
+extern int generic_readdir_fallthru(struct dentry *topmost_dentry, const char *name,
+				    int namlen, ino_t *ino, unsigned char *d_type);
+#else
+static inline int generic_readdir_fallthru(struct dentry *topmost_dentry, const char *name,
+					   int namlen, ino_t *ino, unsigned char *d_type)
+{
+	/*
+	 * Found a fallthru on a kernel without union support.
+	 * There's nothing to fall through to, so return -ENOENT.
+	 */
+	return -ENOENT;
+}
+#endif
+
 #ifdef CONFIG_MIGRATION
 extern int buffer_migrate_page(struct address_space *,
 				struct page *, struct page *,
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 562f453..52f443f 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -83,4 +83,6 @@
 
 extern dev_t name_to_dev_t(char *name);
 
+extern void put_union_sb(struct super_block *sb);
+
 #endif /* _LINUX_MOUNT_H */
diff --git a/include/linux/security.h b/include/linux/security.h
index 5623a7f..83034ad 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -2951,7 +2951,8 @@
 int security_path_link(struct dentry *old_dentry, struct path *new_dir,
 		       struct dentry *new_dentry);
 int security_path_rename(struct path *old_dir, struct dentry *old_dentry,
-			 struct path *new_dir, struct dentry *new_dentry);
+			 struct path *new_dir, struct dentry *new_dentry,
+			 struct inode *old_inode);
 int security_path_chmod(struct path *path, umode_t mode);
 int security_path_chown(struct path *path, kuid_t uid, kgid_t gid);
 int security_path_chroot(struct path *path);
@@ -2999,7 +3000,8 @@
 static inline int security_path_rename(struct path *old_dir,
 				       struct dentry *old_dentry,
 				       struct path *new_dir,
-				       struct dentry *new_dentry)
+				       struct dentry *new_dentry,
+				       struct inode *old_inode)
 {
 	return 0;
 }
diff --git a/kernel/audit.c b/kernel/audit.c
index 906ae5a0..e2d004b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1753,6 +1753,7 @@
 {
 	struct audit_buffer *ab;
 	struct audit_names *name;
+	struct inode *inode;
 
 	name = kzalloc(sizeof(*name), GFP_NOFS);
 	if (!name)
@@ -1770,7 +1771,12 @@
 
 	/* Generate AUDIT_PATH record with object. */
 	name->type = AUDIT_TYPE_NORMAL;
-	audit_copy_inode(name, link->dentry, link->dentry->d_inode);
+	inode = link->dentry->d_inode;
+#ifdef CONFIG_UNION_MOUNT
+	if (!inode)
+		inode = link->dentry->d_fallthru->d_inode;
+#endif
+	audit_copy_inode(name, link->dentry, inode);
 	audit_log_name(current->audit_context, name, link, 0, NULL);
 out:
 	kfree(name);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 90594c9..7cab86d 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1757,11 +1757,12 @@
  * @dentry: dentry being audited
  * @flags: attributes for this particular entry
  */
-void __audit_inode(struct filename *name, const struct dentry *dentry,
-		   unsigned int flags)
+static void __audit_inode(struct filename *name,
+			  const struct dentry *dentry,
+			  const struct inode *inode,
+			  unsigned int flags)
 {
 	struct audit_context *context = current->audit_context;
-	const struct inode *inode = dentry->d_inode;
 	struct audit_names *n;
 	bool parent = flags & AUDIT_INODE_PARENT;
 
@@ -1829,6 +1830,32 @@
 }
 
 /**
+ * __audit_dentry - store the inode and device from a lookup
+ * @name: name being audited (optional)
+ * @dentry: dentry being audited
+ * @flags: attributes for this particular entry
+ */
+void __audit_dentry(struct filename *name, const struct dentry *dentry,
+		    unsigned flags)
+{
+	if (unlikely(!audit_dummy_context()))
+		__audit_inode(name, dentry, dentry->d_inode, flags);
+}
+
+/**
+ * __audit_file - store the inode and device from an open file
+ * @name: name being audited (optional)
+ * @file: open file whose dentry and inode are being audited
+ * @flags: attributes for this particular entry
+ */
+void __audit_file(struct filename *name, const struct file *file,
+		  unsigned flags)
+{
+	if (unlikely(!audit_dummy_context()))
+		__audit_inode(name, file->f_path.dentry, file->f_inode, flags);
+}
+
+/**
  * __audit_inode_child - collect inode info for created/removed objects
  * @parent: inode of dentry parent
  * @dentry: dentry being audited
diff --git a/mm/shmem.c b/mm/shmem.c
index 2ece54b..d1acfbd 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1976,6 +1976,7 @@
 		ret = PTR_ERR(dentry);
 		goto error_free;
 	}
+	d_set_type(whiteout, DCACHE_WHITEOUT_TYPE);
 
 	if (old_dentry->d_inode || d_is_fallthru(old_dentry)) {
 		/* A fallthru for a dir is treated like a regular link */
@@ -2017,7 +2018,10 @@
 		dir->i_size += BOGO_DIRENT_SIZE;
 		dget(dentry); /* Extra count - pin the dentry in core */
 	}
-	/* Will clear DCACHE_WHITEOUT and DCACHE_FALLTHRU flags */
+
+	/* Attach the inode to the dentry - this will set the DCACHE_ENTRY_TYPE
+	 * field and clear DCACHE_FALLTHRU flags.
+	 */
 	d_instantiate(dentry, inode);
 }
 
@@ -2052,11 +2056,8 @@
 	}
 
 	shmem_d_instantiate(dir, dentry, NULL);
+	d_set_fallthru(dentry);
 	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
-
-	spin_lock(&dentry->d_lock);
-	dentry->d_flags |= DCACHE_FALLTHRU;
-	spin_unlock(&dentry->d_lock);
 	return 0;
 }
 
@@ -2155,9 +2156,11 @@
 	 * but each new link needs a new dentry, pinning lowmem, and
 	 * tmpfs dentries cannot be pruned until they are unlinked.
 	 */
-	ret = shmem_reserve_inode(inode->i_sb);
-	if (ret)
-		goto out;
+	if (inode->i_nlink > 0) {
+		ret = shmem_reserve_inode(inode->i_sb);
+		if (ret)
+			goto out;
+	}
 
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
 	inc_nlink(inode);
@@ -2197,7 +2200,7 @@
 
 		spin_lock(&dentry->d_lock);
 		list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) {
-			spin_lock(&child->d_lock);
+			spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
 			if (d_is_whiteout(child)) {
 				__d_drop(child);
 				if (!list_empty(&child->d_lru)) {
diff --git a/security/commoncap.c b/security/commoncap.c
index b9d613e..84dc2e5 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -381,6 +381,11 @@
 	int size;
 	struct vfs_cap_data caps;
 
+#ifdef CONFIG_UNION_MOUNT /* use the lower dentry's inode, if there is one */
+	if (!inode && dentry->d_fallthru)
+		inode = dentry->d_fallthru->d_inode;
+#endif
+
 	memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data));
 
 	if (!inode || !inode->i_op->getxattr)
diff --git a/security/security.c b/security/security.c
index 15b6928..5f09731 100644
--- a/security/security.c
+++ b/security/security.c
@@ -433,10 +433,13 @@
 }
 
 int security_path_rename(struct path *old_dir, struct dentry *old_dentry,
-			 struct path *new_dir, struct dentry *new_dentry)
+			 struct path *new_dir, struct dentry *new_dentry,
+			 struct inode *old_inode)
 {
-	if (unlikely(IS_PRIVATE(old_dentry->d_inode) ||
-		     (new_dentry->d_inode && IS_PRIVATE(new_dentry->d_inode))))
+	struct inode *new_inode = d_inode_or_lower(new_dentry);
+
+	if (unlikely(IS_PRIVATE(old_inode) ||
+		     (new_inode && IS_PRIVATE(new_inode))))
 		return 0;
 	return security_ops->path_rename(old_dir, old_dentry, new_dir,
 					 new_dentry);
@@ -526,8 +529,9 @@
 int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry,
 			   struct inode *new_dir, struct dentry *new_dentry)
 {
-        if (unlikely(IS_PRIVATE(old_dentry->d_inode) ||
-            (new_dentry->d_inode && IS_PRIVATE(new_dentry->d_inode))))
+        if (unlikely(IS_PRIVATE(d_inode_or_lower(old_dentry)) ||
+		     (d_inode_or_lower(new_dentry) &&
+		      IS_PRIVATE(d_inode_or_lower(new_dentry)))))
 		return 0;
 	return security_ops->inode_rename(old_dir, old_dentry,
 					   new_dir, new_dentry);
@@ -535,7 +539,7 @@
 
 int security_inode_readlink(struct dentry *dentry)
 {
-	if (unlikely(IS_PRIVATE(dentry->d_inode)))
+	if (unlikely(IS_PRIVATE(d_inode_or_lower(dentry))))
 		return 0;
 	return security_ops->inode_readlink(dentry);
 }
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 57b0b49..fd77820 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1601,6 +1601,11 @@
 	struct inode *inode = dentry->d_inode;
 	struct common_audit_data ad;
 
+#ifdef CONFIG_UNION_MOUNT
+	if (unlikely(!inode) && dentry->d_fallthru)
+		inode = dentry->d_fallthru->d_inode;
+#endif
+
 	ad.type = LSM_AUDIT_DATA_DENTRY;
 	ad.u.dentry = dentry;
 	return inode_has_perm(cred, inode, av, &ad);
@@ -1616,6 +1621,11 @@
 	struct inode *inode = path->dentry->d_inode;
 	struct common_audit_data ad;
 
+#ifdef CONFIG_UNION_MOUNT
+	if (unlikely(!inode) && path->dentry->d_fallthru)
+		inode = path->dentry->d_fallthru->d_inode;
+#endif
+
 	ad.type = LSM_AUDIT_DATA_PATH;
 	ad.u.path = *path;
 	return inode_has_perm(cred, inode, av, &ad);