fs/xfs/xfs_healthmon.c - linux/kernel/git/klassert/ipsec-next - Git at Google

 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Copyright (c) 2024-2026 Oracle.  All Rights Reserved.
  * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #include "xfs_platform.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
 #include "xfs_inode.h"
 #include "xfs_trace.h"
 #include "xfs_ag.h"
 #include "xfs_btree.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_quota_defs.h"
 #include "xfs_rtgroup.h"
 #include "xfs_health.h"
 #include "xfs_healthmon.h"
 #include "xfs_fsops.h"
 #include "xfs_notify_failure.h"
 #include "xfs_file.h"
 #include "xfs_ioctl.h"

 #include <linux/anon_inodes.h>
 #include <linux/eventpoll.h>
 #include <linux/poll.h>
 #include <linux/fserror.h>

 /*
  * Live Health Monitoring
  * ======================
  *
  * Autonomous self-healing of XFS filesystems requires a means for the kernel
  * to send filesystem health events to a monitoring daemon in userspace.  To
  * accomplish this, we establish a thread_with_file kthread object to handle
  * translating internal events about filesystem health into a format that can
  * be parsed easily by userspace.  When those internal events occur, the core
  * filesystem code calls this health monitor to convey the events to userspace.
  * Userspace reads events from the file descriptor returned by the ioctl.
  *
  * The healthmon abstraction has a weak reference to the host filesystem mount
  * so that the queueing and processing of the events do not pin the mount and
  * cannot slow down the main filesystem.  The healthmon object can exist past
  * the end of the filesystem mount.
  */

 /*
  * Value stored in xfs_healthmon::mount_cookie once the monitor is no longer
  * attached to a mount.  While attached, the cookie holds the address of the
  * monitored filesystem's superblock.
  */
 #define DETACHED_MOUNT_COOKIE		((uintptr_t)0)

 /*
  * Constrain the number of event objects that can build up in memory: no more
  * than fit in 32K.  New events are dropped once the queue is this long.
  */
 #define XFS_HEALTHMON_MAX_EVENTS	(SZ_32K / \
 					 sizeof(struct xfs_healthmon_event))

 /* Upper limit on the size of the output buffer allocated for read_iter. */
 #define XFS_HEALTHMON_MAX_OUTBUF	SZ_64K

 /*
  * Spinlock for atomically updating the xfs_mount <-> xfs_healthmon pointers,
  * i.e. xfs_mount::m_healthmon and xfs_healthmon::mount_cookie.
  */
 static DEFINE_SPINLOCK(xfs_healthmon_lock);

 /*
  * Look up the health monitor attached to @mp and take a reference to it.
  * Returns NULL if no monitor is attached or if the monitor's refcount has
  * already dropped to zero.  A non-NULL result must be released with
  * xfs_healthmon_put.
  */
 static struct xfs_healthmon *
 xfs_healthmon_get(
 	struct xfs_mount		*mp)
 {
 	struct xfs_healthmon		*found = NULL;
 	struct xfs_healthmon		*candidate;

 	rcu_read_lock();
 	candidate = mp->m_healthmon;
 	if (candidate && refcount_inc_not_zero(&candidate->ref))
 		found = candidate;
 	rcu_read_unlock();

 	return found;
 }

 /*
  * Drop one reference to a healthmon object.  When the last reference goes
  * away, free all events still queued, the preallocated unmount event and the
  * output buffer, then free the monitor itself after an RCU grace period so
  * that a concurrent xfs_healthmon_get cannot touch freed memory.
  */
 static void
 xfs_healthmon_put(
 	struct xfs_healthmon		*hm)
 {
 	struct xfs_healthmon_event	*cur;
 	struct xfs_healthmon_event	*following;

 	if (!refcount_dec_and_test(&hm->ref))
 		return;

 	/* Nobody else can see the queue any more; discard what's left. */
 	for (cur = hm->first_event; cur != NULL; cur = following) {
 		trace_xfs_healthmon_drop(hm, cur);
 		following = cur->next;
 		kfree(cur);
 	}

 	kfree(hm->unmount_event);
 	kfree(hm->buffer);
 	mutex_destroy(&hm->lock);
 	kfree_rcu_mightsleep(hm);
 }

 /* Attach a health monitor to an xfs_mount.  Only one allowed at a time. */
 STATIC int
 xfs_healthmon_attach(
 	struct xfs_mount	*mp,
 	struct xfs_healthmon	*hm)
 {
 	spin_lock(&xfs_healthmon_lock);
 	if (mp->m_healthmon != NULL) {
 		spin_unlock(&xfs_healthmon_lock);
 		return -EEXIST;
 	}

 	refcount_inc(&hm->ref);
 	mp->m_healthmon = hm;
 	hm->mount_cookie = (uintptr_t)mp->m_super;
 	spin_unlock(&xfs_healthmon_lock);

 	return 0;
 }

 /* Detach a xfs mount from a specific healthmon instance. */
 STATIC void
 xfs_healthmon_detach(
 	struct xfs_healthmon	*hm)
 {
 	spin_lock(&xfs_healthmon_lock);
 	if (hm->mount_cookie == DETACHED_MOUNT_COOKIE) {
 		spin_unlock(&xfs_healthmon_lock);
 		return;
 	}

 	XFS_M((struct super_block *)hm->mount_cookie)->m_healthmon = NULL;
 	hm->mount_cookie = DETACHED_MOUNT_COOKIE;
 	spin_unlock(&xfs_healthmon_lock);

 	trace_xfs_healthmon_detach(hm);
 	xfs_healthmon_put(hm);
 }

 /* Count one more event, both in the queue length and the lifetime total. */
 static inline void xfs_healthmon_bump_events(struct xfs_healthmon *hm)
 {
 	hm->total_events += 1;
 	hm->events += 1;
 }

 /* Count one more lost event, both as not-yet-reported and lifetime total. */
 static inline void xfs_healthmon_bump_lost(struct xfs_healthmon *hm)
 {
 	hm->total_lost += 1;
 	hm->lost_prev_event += 1;
 }

 /*
  * If possible, merge a new event into an existing event.  Returns whether or
  * not it merged anything.
  */
 static bool
 xfs_healthmon_merge_events(
 	struct xfs_healthmon_event		*existing,
 	const struct xfs_healthmon_event	*new)
 {
 	if (!existing)
 		return false;

 	/* type and domain must match to merge events */
 	if (existing->type != new->type ||
 	    existing->domain != new->domain)
 		return false;

 	switch (existing->type) {
 	case XFS_HEALTHMON_RUNNING:
 	case XFS_HEALTHMON_UNMOUNT:
 		/* should only ever be one of these events anyway */
 		return false;

 	case XFS_HEALTHMON_LOST:
 		existing->lostcount += new->lostcount;
 		return true;

 	case XFS_HEALTHMON_SICK:
 	case XFS_HEALTHMON_CORRUPT:
 	case XFS_HEALTHMON_HEALTHY:
 		switch (existing->domain) {
 		case XFS_HEALTHMON_FS:
 			existing->fsmask |= new->fsmask;
 			return true;
 		case XFS_HEALTHMON_AG:
 		case XFS_HEALTHMON_RTGROUP:
 			if (existing->group == new->group){
 				existing->grpmask |= new->grpmask;
 				return true;
 			}
 			return false;
 		case XFS_HEALTHMON_INODE:
 			if (existing->ino == new->ino &&
 			    existing->gen == new->gen) {
 				existing->imask |= new->imask;
 				return true;
 			}
 			return false;
 		default:
 			ASSERT(0);
 			return false;
 		}
 		return false;

 	case XFS_HEALTHMON_SHUTDOWN:
 		/* yes, we can race to shutdown */
 		existing->flags |= new->flags;
 		return true;

 	case XFS_HEALTHMON_MEDIA_ERROR:
 		/* physically adjacent errors can merge */
 		if (existing->daddr + existing->bbcount == new->daddr) {
 			existing->bbcount += new->bbcount;
 			return true;
 		}
 		if (new->daddr + new->bbcount == existing->daddr) {
 			existing->daddr = new->daddr;
 			existing->bbcount += new->bbcount;
 			return true;
 		}
 		return false;

 	case XFS_HEALTHMON_BUFREAD:
 	case XFS_HEALTHMON_BUFWRITE:
 	case XFS_HEALTHMON_DIOREAD:
 	case XFS_HEALTHMON_DIOWRITE:
 	case XFS_HEALTHMON_DATALOST:
 		/* logically adjacent file ranges can merge */
 		if (existing->fino != new->fino || existing->fgen != new->fgen)
 			return false;

 		if (existing->fpos + existing->flen == new->fpos) {
 			existing->flen += new->flen;
 			return true;
 		}

 		if (new->fpos + new->flen == existing->fpos) {
 			existing->fpos = new->fpos;
 			existing->flen += new->flen;
 			return true;
 		}
 		return false;
 	}

 	return false;
 }

 /*
  * Insert an event at the start of the queue so that it is the next event
  * handed to userspace.  Timestamps the event, bumps the event counters and
  * wakes up anyone waiting for event data.  The queue takes ownership of
  * @event.
  */
 static inline void
 __xfs_healthmon_insert(
 	struct xfs_healthmon		*hm,
 	struct xfs_healthmon_event	*event)
 {
 	struct timespec64		now;

 	ktime_get_coarse_real_ts64(&now);
 	event->time_ns = (now.tv_sec * NSEC_PER_SEC) + now.tv_nsec;

 	/*
 	 * The new event always becomes the head of the list, whether or not
 	 * anything was queued before; otherwise it would never be linked into
 	 * a non-empty queue.  It is also the tail only if the queue was empty.
 	 */
 	event->next = hm->first_event;
 	hm->first_event = event;
 	if (!hm->last_event)
 		hm->last_event = event;
 	xfs_healthmon_bump_events(hm);
 	wake_up(&hm->wait);

 	trace_xfs_healthmon_insert(hm, event);
 }

 /* Push an event onto the end of the queue. */
 static inline void
 __xfs_healthmon_push(
 	struct xfs_healthmon		*hm,
 	struct xfs_healthmon_event	*event)
 {
 	struct timespec64		now;

 	ktime_get_coarse_real_ts64(&now);
 	event->time_ns = (now.tv_sec * NSEC_PER_SEC) + now.tv_nsec;

 	if (!hm->first_event)
 		hm->first_event = event;
 	if (hm->last_event)
 		hm->last_event->next = event;
 	hm->last_event = event;
 	event->next = NULL;
 	xfs_healthmon_bump_events(hm);
 	wake_up(&hm->wait);

 	trace_xfs_healthmon_push(hm, event);
 }

 /*
  * Tell userspace about events that were lost earlier.  The lost count is
  * either merged into the newest queued event or queued as a new "lost" event
  * of its own.  Returns 0 and resets the pending lost counter on success, or
  * -ENOMEM if the queue is full or the event cannot be allocated.
  */
 static int
 xfs_healthmon_clear_lost_prev(
 	struct xfs_healthmon		*hm)
 {
 	struct xfs_healthmon_event	lost_event = {
 		.type			= XFS_HEALTHMON_LOST,
 		.domain			= XFS_HEALTHMON_MOUNT,
 		.lostcount		= hm->lost_prev_event,
 	};
 	struct xfs_healthmon_event	*copy;

 	if (xfs_healthmon_merge_events(hm->last_event, &lost_event)) {
 		trace_xfs_healthmon_merge(hm, hm->last_event);
 		wake_up(&hm->wait);
 	} else {
 		/* Couldn't merge, so queue a separate event if there's room. */
 		if (hm->events >= XFS_HEALTHMON_MAX_EVENTS)
 			return -ENOMEM;

 		copy = kmemdup(&lost_event, sizeof(lost_event), GFP_NOFS);
 		if (copy == NULL)
 			return -ENOMEM;

 		__xfs_healthmon_push(hm, copy);
 	}

 	hm->lost_prev_event = 0;
 	return 0;
 }

 /*
  * Push an event onto the end of the list after dealing with lost events and
  * possibly full queues.
  *
  * @template is copied, so the caller keeps ownership of it.  Returns 0 if the
  * event was queued or merged into the newest queued event, -ESHUTDOWN if the
  * monitor has already detached from the mount, or -ENOMEM if the event had to
  * be dropped.  Every dropped event is counted as lost so that userspace can be
  * told how many events it missed.
  */
 STATIC int
 xfs_healthmon_push(
 	struct xfs_healthmon			*hm,
 	const struct xfs_healthmon_event	*template)
 {
 	struct xfs_healthmon_event		*event = NULL;
 	int					error = 0;

 	/*
 	 * Locklessly check if the health monitor has already detached from the
 	 * mount.  If so, ignore the event.  If we race with deactivation,
 	 * we'll queue the event but never send it.
 	 */
 	if (hm->mount_cookie == DETACHED_MOUNT_COOKIE)
 		return -ESHUTDOWN;

 	mutex_lock(&hm->lock);

 	/* Report previously lost events before we do anything else */
 	if (hm->lost_prev_event) {
 		error = xfs_healthmon_clear_lost_prev(hm);
 		if (error) {
 			/*
 			 * We couldn't even report the earlier losses, so this
 			 * event is being dropped as well.  Count it, or the
 			 * lost counters would stop advancing as soon as the
 			 * queue fills up.
 			 */
 			trace_xfs_healthmon_lost_event(hm);
 			xfs_healthmon_bump_lost(hm);
 			goto out_unlock;
 		}
 	}

 	/* Try to merge with the newest event */
 	if (xfs_healthmon_merge_events(hm->last_event, template)) {
 		trace_xfs_healthmon_merge(hm, hm->last_event);
 		wake_up(&hm->wait);
 		goto out_unlock;
 	}

 	/* Only create a heap event object if we're not already at capacity. */
 	if (hm->events < XFS_HEALTHMON_MAX_EVENTS)
 		event = kmemdup(template, sizeof(struct xfs_healthmon_event),
 				GFP_NOFS);
 	if (!event) {
 		/* No memory means we lose the event */
 		trace_xfs_healthmon_lost_event(hm);
 		xfs_healthmon_bump_lost(hm);
 		error = -ENOMEM;
 		goto out_unlock;
 	}

 	__xfs_healthmon_push(hm, event);

 out_unlock:
 	mutex_unlock(&hm->lock);
 	return error;
 }

 /*
  * Report that the filesystem is being unmounted, then detach the xfs mount
  * from this healthmon instance.  Does nothing if no health monitor is
  * attached to @mp.
  */
 void
 xfs_healthmon_unmount(
 	struct xfs_mount		*mp)
 {
 	struct xfs_healthmon		*hm = xfs_healthmon_get(mp);

 	if (!hm)
 		return;

 	trace_xfs_healthmon_report_unmount(hm);

 	/*
 	 * Insert the unmount notification at the start of the event queue so
 	 * that userspace knows the filesystem went away as soon as possible.
 	 * There's nothing actionable for userspace after an unmount.  Once
 	 * we've inserted the unmount event, hm no longer owns that event.
 	 *
 	 * Readers of the monitor file and event producers modify the queue
 	 * under hm->lock and can run concurrently with unmount, so the
 	 * insertion must hold that lock too.
 	 */
 	mutex_lock(&hm->lock);
 	__xfs_healthmon_insert(hm, hm->unmount_event);
 	hm->unmount_event = NULL;
 	mutex_unlock(&hm->lock);

 	xfs_healthmon_detach(hm);
 	xfs_healthmon_put(hm);
 }

 /*
  * Work out which health bits of a metadata event should be reported, given
  * the health state before (@old_mask) and the bits named by the event
  * (@new_mask).  Verbose monitors get everything; otherwise the answer depends
  * on the event type.  Not meant for unmount or other non-metadata events.
  */
 static inline unsigned int
 metadata_event_mask(
 	struct xfs_healthmon		*hm,
 	enum xfs_healthmon_type		type,
 	unsigned int			old_mask,
 	unsigned int			new_mask)
 {
 	unsigned int			mask = 0;

 	/* If we want all events, return all events. */
 	if (hm->verbose)
 		return new_mask;

 	switch (type) {
 	case XFS_HEALTHMON_SICK:
 		/* Always report runtime corruptions */
 		mask = new_mask;
 		break;
 	case XFS_HEALTHMON_CORRUPT:
 		/* Only report new fsck errors */
 		mask = new_mask & ~old_mask;
 		break;
 	case XFS_HEALTHMON_HEALTHY:
 		/* Only report healthy metadata that got fixed */
 		mask = new_mask & old_mask;
 		break;
 	default:
 		ASSERT(0);
 		break;
 	}

 	return mask;
 }

 /* Report XFS_FS_SICK_* events to healthmon */
 void
 xfs_healthmon_report_fs(
 	struct xfs_mount		*mp,
 	enum xfs_healthmon_type		type,
 	unsigned int			old_mask,
 	unsigned int			new_mask)
 {
 	struct xfs_healthmon_event	event = {
 		.type			= type,
 		.domain			= XFS_HEALTHMON_FS,
 	};
 	struct xfs_healthmon		*hm = xfs_healthmon_get(mp);

 	if (!hm)
 		return;

 	event.fsmask = metadata_event_mask(hm, type, old_mask, new_mask) &
 			~XFS_SICK_FS_SECONDARY;
 	trace_xfs_healthmon_report_fs(hm, old_mask, new_mask, &event);

 	if (event.fsmask)
 		xfs_healthmon_push(hm, &event);

 	xfs_healthmon_put(hm);
 }

 /* Report XFS_SICK_(AG|RG)* flags to healthmon */
 void
 xfs_healthmon_report_group(
 	struct xfs_group		*xg,
 	enum xfs_healthmon_type		type,
 	unsigned int			old_mask,
 	unsigned int			new_mask)
 {
 	struct xfs_healthmon_event	event = {
 		.type			= type,
 		.group			= xg->xg_gno,
 	};
 	struct xfs_healthmon		*hm = xfs_healthmon_get(xg->xg_mount);

 	if (!hm)
 		return;

 	switch (xg->xg_type) {
 	case XG_TYPE_RTG:
 		event.domain = XFS_HEALTHMON_RTGROUP;
 		event.grpmask = metadata_event_mask(hm, type, old_mask,
 						    new_mask) &
 				~XFS_SICK_RG_SECONDARY;
 		break;
 	case XG_TYPE_AG:
 		event.domain = XFS_HEALTHMON_AG;
 		event.grpmask = metadata_event_mask(hm, type, old_mask,
 						    new_mask) &
 				~XFS_SICK_AG_SECONDARY;
 		break;
 	default:
 		ASSERT(0);
 		break;
 	}

 	trace_xfs_healthmon_report_group(hm, old_mask, new_mask, &event);

 	if (event.grpmask)
 		xfs_healthmon_push(hm, &event);

 	xfs_healthmon_put(hm);
 }

 /* Report XFS_SICK_INO_* flags to healthmon */
 void
 xfs_healthmon_report_inode(
 	struct xfs_inode		*ip,
 	enum xfs_healthmon_type		type,
 	unsigned int			old_mask,
 	unsigned int			new_mask)
 {
 	struct xfs_healthmon_event	event = {
 		.type			= type,
 		.domain			= XFS_HEALTHMON_INODE,
 		.ino			= ip->i_ino,
 		.gen			= VFS_I(ip)->i_generation,
 	};
 	struct xfs_healthmon		*hm = xfs_healthmon_get(ip->i_mount);

 	if (!hm)
 		return;

 	event.imask = metadata_event_mask(hm, type, old_mask, new_mask) &
 			~XFS_SICK_INO_SECONDARY;
 	trace_xfs_healthmon_report_inode(hm, old_mask, event.imask, &event);

 	if (event.imask)
 		xfs_healthmon_push(hm, &event);

 	xfs_healthmon_put(hm);
 }

 /*
  * Queue a filesystem shutdown event carrying the shutdown @flags for the
  * health monitor attached to @mp, if there is one.
  */
 void
 xfs_healthmon_report_shutdown(
 	struct xfs_mount		*mp,
 	uint32_t			flags)
 {
 	struct xfs_healthmon_event	event = {
 		.type			= XFS_HEALTHMON_SHUTDOWN,
 		.domain			= XFS_HEALTHMON_MOUNT,
 		.flags			= flags,
 	};
 	struct xfs_healthmon		*hm;

 	hm = xfs_healthmon_get(mp);
 	if (hm == NULL)
 		return;

 	trace_xfs_healthmon_report_shutdown(hm, flags);
 	xfs_healthmon_push(hm, &event);

 	xfs_healthmon_put(hm);
 }

 /*
  * Map the device that suffered a media error to the matching healthmon
  * domain.  Unknown devices trip an assertion and map to domain value 0.
  */
 static inline enum xfs_healthmon_domain
 media_error_domain(
 	enum xfs_device			fdev)
 {
 	/* domains are non-negative array indices, so -1 means "no match" */
 	int				domain = -1;

 	switch (fdev) {
 	case XFS_DEV_DATA:
 		domain = XFS_HEALTHMON_DATADEV;
 		break;
 	case XFS_DEV_LOG:
 		domain = XFS_HEALTHMON_LOGDEV;
 		break;
 	case XFS_DEV_RT:
 		domain = XFS_HEALTHMON_RTDEV;
 		break;
 	}

 	if (domain < 0) {
 		ASSERT(0);
 		return 0;
 	}

 	return domain;
 }

 /*
  * Queue a media error event covering @bbcount basic blocks starting at
  * @daddr on device @fdev for the health monitor attached to @mp, if there is
  * one.
  */
 void
 xfs_healthmon_report_media(
 	struct xfs_mount		*mp,
 	enum xfs_device			fdev,
 	xfs_daddr_t			daddr,
 	uint64_t			bbcount)
 {
 	struct xfs_healthmon_event	event = {
 		.type			= XFS_HEALTHMON_MEDIA_ERROR,
 		.domain			= media_error_domain(fdev),
 		.daddr			= daddr,
 		.bbcount		= bbcount,
 	};
 	struct xfs_healthmon		*hm;

 	hm = xfs_healthmon_get(mp);
 	if (hm == NULL)
 		return;

 	trace_xfs_healthmon_report_media(hm, fdev, &event);
 	xfs_healthmon_push(hm, &event);

 	xfs_healthmon_put(hm);
 }

 /*
  * Map a generic file error type to the matching healthmon event type.
  * Anything without a mapping trips an assertion and yields -1.
  */
 static inline enum xfs_healthmon_type file_ioerr_type(enum fserror_type action)
 {
 	/* event types are non-negative array indices, so -1 means "no match" */
 	int	type = -1;

 	switch (action) {
 	case FSERR_BUFFERED_READ:
 		type = XFS_HEALTHMON_BUFREAD;
 		break;
 	case FSERR_BUFFERED_WRITE:
 		type = XFS_HEALTHMON_BUFWRITE;
 		break;
 	case FSERR_DIRECTIO_READ:
 		type = XFS_HEALTHMON_DIOREAD;
 		break;
 	case FSERR_DIRECTIO_WRITE:
 		type = XFS_HEALTHMON_DIOWRITE;
 		break;
 	case FSERR_DATA_LOST:
 		type = XFS_HEALTHMON_DATALOST;
 		break;
 	case FSERR_METADATA:
 		/* filtered out by xfs_fs_report_error */
 		break;
 	}

 	if (type < 0) {
 		ASSERT(0);
 	}

 	return type;
 }

 /* Add a file io error event to the reporting queue. */
 void
 xfs_healthmon_report_file_ioerror(
 	struct xfs_inode		*ip,
 	const struct fserror_event	*p)
 {
 	struct xfs_healthmon_event	event = {
 		.type			= file_ioerr_type(p->type),
 		.domain			= XFS_HEALTHMON_FILERANGE,
 		.fino			= ip->i_ino,
 		.fgen			= VFS_I(ip)->i_generation,
 		.fpos			= p->pos,
 		.flen			= p->len,
 		/* send positive error number to userspace */
 		.error			= -p->error,
 	};
 	struct xfs_healthmon		*hm = xfs_healthmon_get(ip->i_mount);

 	if (!hm)
 		return;

 	trace_xfs_healthmon_report_file_ioerror(hm, p);

 	xfs_healthmon_push(hm, &event);
 	xfs_healthmon_put(hm);
 }

 /* Mark the output buffer empty by rewinding both cursors to the start. */
 static inline void
 xfs_healthmon_reset_outbuf(
 	struct xfs_healthmon		*hm)
 {
 	hm->bufhead = 0;
 	hm->buftail = 0;
 }

 /*
  * One entry of a flag translation table: if any bit of in_mask is set in the
  * input, __map_flags sets out_mask in the output.
  */
 struct flags_map {
 	unsigned int		in_mask;
 	unsigned int		out_mask;
 };

 /*
  * Translation from the SHUTDOWN_* flags recorded in a shutdown event to the
  * XFS_HEALTH_SHUTDOWN_* reason flags placed in the formatted event.
  */
 static const struct flags_map shutdown_map[] = {
 	{ SHUTDOWN_META_IO_ERROR,	XFS_HEALTH_SHUTDOWN_META_IO_ERROR },
 	{ SHUTDOWN_LOG_IO_ERROR,	XFS_HEALTH_SHUTDOWN_LOG_IO_ERROR },
 	{ SHUTDOWN_FORCE_UMOUNT,	XFS_HEALTH_SHUTDOWN_FORCE_UMOUNT },
 	{ SHUTDOWN_CORRUPT_INCORE,	XFS_HEALTH_SHUTDOWN_CORRUPT_INCORE },
 	{ SHUTDOWN_CORRUPT_ONDISK,	XFS_HEALTH_SHUTDOWN_CORRUPT_ONDISK },
 	{ SHUTDOWN_DEVICE_REMOVED,	XFS_HEALTH_SHUTDOWN_DEVICE_REMOVED },
 };

 /*
  * Translate @flags through a table of @array_len entries: the result is the
  * OR of out_mask for every entry whose in_mask shares a bit with @flags.
  */
 static inline unsigned int
 __map_flags(
 	const struct flags_map	*map,
 	size_t			array_len,
 	unsigned int		flags)
 {
 	unsigned int		out = 0;
 	size_t			i;

 	for (i = 0; i < array_len; i++) {
 		if ((flags & map[i].in_mask) != 0)
 			out |= map[i].out_mask;
 	}

 	return out;
 }

 /* Translate flags through a table; @map must be an array, not a pointer. */
 #define map_flags(map, flags) __map_flags((map), ARRAY_SIZE(map), (flags))

 /* Translate SHUTDOWN_* flags into XFS_HEALTH_SHUTDOWN_* reason flags. */
 static inline unsigned int shutdown_mask(unsigned int in)
 {
 	unsigned int	reasons;

 	reasons = __map_flags(shutdown_map, ARRAY_SIZE(shutdown_map), in);
 	return reasons;
 }

 /*
  * Translation from internal event domains (the array index) to the
  * XFS_HEALTH_MONITOR_DOMAIN_* codes placed in formatted events.
  */
 static const unsigned int domain_map[] = {
 	[XFS_HEALTHMON_MOUNT]		= XFS_HEALTH_MONITOR_DOMAIN_MOUNT,
 	[XFS_HEALTHMON_FS]		= XFS_HEALTH_MONITOR_DOMAIN_FS,
 	[XFS_HEALTHMON_AG]		= XFS_HEALTH_MONITOR_DOMAIN_AG,
 	[XFS_HEALTHMON_INODE]		= XFS_HEALTH_MONITOR_DOMAIN_INODE,
 	[XFS_HEALTHMON_RTGROUP]		= XFS_HEALTH_MONITOR_DOMAIN_RTGROUP,
 	[XFS_HEALTHMON_DATADEV]		= XFS_HEALTH_MONITOR_DOMAIN_DATADEV,
 	[XFS_HEALTHMON_RTDEV]		= XFS_HEALTH_MONITOR_DOMAIN_RTDEV,
 	[XFS_HEALTHMON_LOGDEV]		= XFS_HEALTH_MONITOR_DOMAIN_LOGDEV,
 	[XFS_HEALTHMON_FILERANGE]	= XFS_HEALTH_MONITOR_DOMAIN_FILERANGE,
 };

 /*
  * Translation from internal event types (the array index) to the
  * XFS_HEALTH_MONITOR_TYPE_* codes placed in formatted events.
  */
 static const unsigned int type_map[] = {
 	[XFS_HEALTHMON_RUNNING]		= XFS_HEALTH_MONITOR_TYPE_RUNNING,
 	[XFS_HEALTHMON_LOST]		= XFS_HEALTH_MONITOR_TYPE_LOST,
 	[XFS_HEALTHMON_SICK]		= XFS_HEALTH_MONITOR_TYPE_SICK,
 	[XFS_HEALTHMON_CORRUPT]		= XFS_HEALTH_MONITOR_TYPE_CORRUPT,
 	[XFS_HEALTHMON_HEALTHY]		= XFS_HEALTH_MONITOR_TYPE_HEALTHY,
 	[XFS_HEALTHMON_UNMOUNT]		= XFS_HEALTH_MONITOR_TYPE_UNMOUNT,
 	[XFS_HEALTHMON_SHUTDOWN]	= XFS_HEALTH_MONITOR_TYPE_SHUTDOWN,
 	[XFS_HEALTHMON_MEDIA_ERROR]	= XFS_HEALTH_MONITOR_TYPE_MEDIA_ERROR,
 	[XFS_HEALTHMON_BUFREAD]		= XFS_HEALTH_MONITOR_TYPE_BUFREAD,
 	[XFS_HEALTHMON_BUFWRITE]	= XFS_HEALTH_MONITOR_TYPE_BUFWRITE,
 	[XFS_HEALTHMON_DIOREAD]		= XFS_HEALTH_MONITOR_TYPE_DIOREAD,
 	[XFS_HEALTHMON_DIOWRITE]	= XFS_HEALTH_MONITOR_TYPE_DIOWRITE,
 	[XFS_HEALTHMON_DATALOST]	= XFS_HEALTH_MONITOR_TYPE_DATALOST,
 };

 /*
  * Render an event as a V0 structure and append it to the output buffer.
  * Returns -EFSCORRUPTED if the event's domain or type lies outside the
  * translation tables, and 0 otherwise.  If the formatted record does not fit
  * in the buffer it is not copied, but 0 is still returned.
  */
 STATIC int
 xfs_healthmon_format_v0(
 	struct xfs_healthmon		*hm,
 	const struct xfs_healthmon_event *event)
 {
 	struct xfs_health_monitor_event	hme = {
 		.time_ns		= event->time_ns,
 	};

 	trace_xfs_healthmon_format(hm, event);

 	/* Never index past the end of the translation tables. */
 	if (event->domain < 0 || event->domain >= ARRAY_SIZE(domain_map))
 		return -EFSCORRUPTED;
 	if (event->type < 0 || event->type >= ARRAY_SIZE(type_map))
 		return -EFSCORRUPTED;

 	hme.domain = domain_map[event->domain];
 	hme.type = type_map[event->type];

 	/* fill in the event-specific details */
 	switch (event->domain) {
 	case XFS_HEALTHMON_MOUNT:
 		/* only lost and shutdown events carry a payload here */
 		if (event->type == XFS_HEALTHMON_LOST)
 			hme.e.lost.count = event->lostcount;
 		else if (event->type == XFS_HEALTHMON_SHUTDOWN)
 			hme.e.shutdown.reasons = shutdown_mask(event->flags);
 		break;
 	case XFS_HEALTHMON_FS:
 		hme.e.fs.mask = xfs_healthmon_fs_mask(event->fsmask);
 		break;
 	case XFS_HEALTHMON_AG:
 		hme.e.group.mask = xfs_healthmon_perag_mask(event->grpmask);
 		hme.e.group.gno = event->group;
 		break;
 	case XFS_HEALTHMON_RTGROUP:
 		hme.e.group.mask = xfs_healthmon_rtgroup_mask(event->grpmask);
 		hme.e.group.gno = event->group;
 		break;
 	case XFS_HEALTHMON_INODE:
 		hme.e.inode.mask = xfs_healthmon_inode_mask(event->imask);
 		hme.e.inode.ino = event->ino;
 		hme.e.inode.gen = event->gen;
 		break;
 	case XFS_HEALTHMON_DATADEV:
 	case XFS_HEALTHMON_LOGDEV:
 	case XFS_HEALTHMON_RTDEV:
 		hme.e.media.daddr = event->daddr;
 		hme.e.media.bbcount = event->bbcount;
 		break;
 	case XFS_HEALTHMON_FILERANGE:
 		hme.e.filerange.ino = event->fino;
 		hme.e.filerange.gen = event->fgen;
 		hme.e.filerange.pos = event->fpos;
 		hme.e.filerange.len = event->flen;
 		hme.e.filerange.error = abs(event->error);
 		break;
 	default:
 		break;
 	}

 	ASSERT(hm->bufhead + sizeof(hme) <= hm->bufsize);

 	/* Skip the copy rather than run off the end of the outbuf. */
 	if (hm->bufhead + sizeof(hme) > hm->bufsize)
 		return 0;

 	/* copy formatted object to the outbuf */
 	memcpy(hm->buffer + hm->bufhead, &hme, sizeof(hme));
 	hm->bufhead += sizeof(hme);

 	return 0;
 }

 /*
  * Number of formatted bytes in the outbuf that have not been copied out yet,
  * i.e. the distance from the tail cursor to the head cursor.
  */
 static inline size_t
 xfs_healthmon_outbuf_bytes(
 	struct xfs_healthmon	*hm)
 {
 	return hm->bufhead > hm->buftail ? hm->bufhead - hm->buftail : 0;
 }

 /*
  * Is there anything for userspace to read?  That is the case if the monitor
  * has detached from its mount, if events are waiting in the queue, or if
  * formatted bytes are still pending in the outbuf.
  */
 static inline bool
 xfs_healthmon_has_eventdata(
 	struct xfs_healthmon	*hm)
 {
 	/*
 	 * A detached monitor always counts as readable: we want reads to
 	 * return 0 bytes even if there are no events, because userspace
 	 * interprets that as EOF.  If we race with deactivation, read_iter
 	 * will take the necessary locks to discover that there are no events
 	 * to send.
 	 */
 	if (hm->mount_cookie == DETACHED_MOUNT_COOKIE)
 		return true;

 	/* Events waiting to be formatted into the buffer? */
 	if (hm->events > 0)
 		return true;

 	/* Otherwise only unread bytes in the buffer count. */
 	return xfs_healthmon_outbuf_bytes(hm) > 0;
 }

 /*
  * Try to copy the rest of the outbuf to the iov iter.
  *
  * Returns the number of bytes copied, which is zero if the outbuf is empty or
  * the iter has no room left, or -EFAULT if there was something to copy and
  * room to copy it to but nothing could be copied.  The outbuf cursors are
  * rewound once everything in the buffer has been consumed.
  */
 STATIC ssize_t
 xfs_healthmon_copybuf(
 	struct xfs_healthmon	*hm,
 	struct iov_iter		*to)
 {
 	size_t			to_copy;
 	size_t			w = 0;

 	trace_xfs_healthmon_copybuf(hm, to);

 	/*
 	 * Don't try to copy more than the caller has room for.  In particular,
 	 * a zero-length read must not be mistaken for a fault just because
 	 * the outbuf still holds data that couldn't be copied anywhere.
 	 */
 	to_copy = min(xfs_healthmon_outbuf_bytes(hm), iov_iter_count(to));
 	if (to_copy) {
 		w = copy_to_iter(hm->buffer + hm->buftail, to_copy, to);
 		if (!w)
 			return -EFAULT;

 		hm->buftail += w;
 	}

 	/*
 	 * Nothing left to copy?  Reset the output buffer cursors to the start
 	 * since there's no live data in the buffer.
 	 */
 	if (xfs_healthmon_outbuf_bytes(hm) == 0)
 		xfs_healthmon_reset_outbuf(hm);
 	return w;
 }

 /*
  * Return a health monitoring event for formatting into the output buffer if
  * there's enough space in the outbuf and an event waiting for us.  Caller
  * must hold i_rwsem on the healthmon file.
  */
 static inline struct xfs_healthmon_event *
 xfs_healthmon_format_pop(
 	struct xfs_healthmon	*hm)
 {
 	struct xfs_healthmon_event *event;

 	if (hm->bufhead + sizeof(*event) > hm->bufsize)
 		return NULL;

 	mutex_lock(&hm->lock);
 	event = hm->first_event;
 	if (event) {
 		if (hm->last_event == event)
 			hm->last_event = NULL;
 		hm->first_event = event->next;
 		hm->events--;

 		trace_xfs_healthmon_pop(hm, event);
 	}
 	mutex_unlock(&hm->lock);
 	return event;
 }

 /*
  * Allocate the formatting buffer.  The size follows the caller's read buffer
  * size, clamped to between one page and XFS_HEALTHMON_MAX_OUTBUF.  If a
  * buffer larger than a page cannot be allocated, retry with a single page.
  * Returns 0 on success or -ENOMEM.
  */
 STATIC int
 xfs_healthmon_alloc_outbuf(
 	struct xfs_healthmon	*hm,
 	size_t			user_bufsize)
 {
 	size_t			bufsize;
 	void			*outbuf;

 	bufsize = min(XFS_HEALTHMON_MAX_OUTBUF, max(PAGE_SIZE, user_bufsize));

 	outbuf = kzalloc(bufsize, GFP_KERNEL);
 	if (!outbuf && bufsize != PAGE_SIZE) {
 		/* fall back to the minimum size */
 		bufsize = PAGE_SIZE;
 		outbuf = kzalloc(bufsize, GFP_KERNEL);
 	}
 	if (!outbuf)
 		return -ENOMEM;

 	hm->buffer = outbuf;
 	hm->bufsize = bufsize;
 	hm->buftail = 0;
 	hm->bufhead = 0;

 	return 0;
 }

 /*
  * Convey queued event data to userspace.  First copy any remaining bytes in
  * the outbuf, then format the oldest event into the outbuf and copy that too.
  *
  * Nonblocking readers get -EAGAIN if there is nothing to read or if the
  * file's inode lock cannot be taken immediately.  Blocking readers sleep
  * interruptibly until there is something to read.  Returns the number of
  * bytes copied; if nothing was copied, returns zero or a negative errno.
  */
 STATIC ssize_t
 xfs_healthmon_read_iter(
 	struct kiocb		*iocb,
 	struct iov_iter		*to)
 {
 	struct file		*file = iocb->ki_filp;
 	struct inode		*inode = file_inode(file);
 	struct xfs_healthmon	*hm = file->private_data;
 	struct xfs_healthmon_event *event;
 	size_t			copied = 0;
 	ssize_t			ret = 0;

 	if (file->f_flags & O_NONBLOCK) {
 		if (!xfs_healthmon_has_eventdata(hm) || !inode_trylock(inode))
 			return -EAGAIN;
 	} else {
 		ret = wait_event_interruptible(hm->wait,
 				xfs_healthmon_has_eventdata(hm));
 		if (ret)
 			return ret;

 		inode_lock(inode);
 	}

 	/*
 	 * The output buffer is allocated on first use, sized according to the
 	 * buffer of the read that triggers the allocation.
 	 */
 	if (hm->bufsize == 0) {
 		ret = xfs_healthmon_alloc_outbuf(hm, iov_iter_count(to));
 		if (ret)
 			goto out_unlock;
 	}

 	trace_xfs_healthmon_read_start(hm);

 	/*
 	 * If there's anything left in the output buffer, copy that before
 	 * formatting more events.
 	 */
 	ret = xfs_healthmon_copybuf(hm, to);
 	if (ret < 0)
 		goto out_unlock;
 	copied += ret;

 	while (iov_iter_count(to) > 0) {
 		/* Format the next events into the outbuf until it's full. */
 		while ((event = xfs_healthmon_format_pop(hm)) != NULL) {
 			ret = xfs_healthmon_format_v0(hm, event);
 			/* the popped event is ours to free, formatted or not */
 			kfree(event);
 			if (ret)
 				goto out_unlock;
 		}

 		/* Copy anything formatted into outbuf to userspace */
 		ret = xfs_healthmon_copybuf(hm, to);
 		if (ret <= 0)
 			break;

 		copied += ret;
 	}

 out_unlock:
 	trace_xfs_healthmon_read_finish(hm);
 	inode_unlock(inode);
 	/* Report partial progress in preference to a later error. */
 	return copied ?: ret;
 }

 /*
  * Poll for available events.  Registers the caller on the monitor's wait
  * queue and reports EPOLLIN if there is something to read right now.
  */
 STATIC __poll_t
 xfs_healthmon_poll(
 	struct file			*file,
 	struct poll_table_struct	*wait)
 {
 	struct xfs_healthmon		*hm = file->private_data;

 	poll_wait(file, &hm->wait, wait);

 	return xfs_healthmon_has_eventdata(hm) ? EPOLLIN : 0;
 }

 /*
  * Release the health monitoring file: detach the monitor from its mount if
  * it is still attached, wake any remaining waiters, and drop the file's
  * reference to the monitor.  Always returns 0.
  */
 STATIC int
 xfs_healthmon_release(
 	struct inode		*inode,
 	struct file		*file)
 {
 	struct xfs_healthmon	*hm = file->private_data;

 	trace_xfs_healthmon_release(hm);

 	/*
 	 * We might be closing the healthmon file before the filesystem
 	 * unmounts, because userspace processes can terminate at any time and
 	 * for any reason.  Null out xfs_mount::m_healthmon so that another
 	 * process can create another health monitor file.
 	 */
 	xfs_healthmon_detach(hm);

 	/*
 	 * Wake up any readers that might be left.  There shouldn't be any
 	 * because the only users of the waiter are read and poll.
 	 */
 	wake_up_all(&hm->wait);

 	/* Drop the file's reference; this frees hm if it was the last one. */
 	xfs_healthmon_put(hm);
 	return 0;
 }

 /* Validate ioctl parameters. */
 static inline bool
 xfs_healthmon_validate(
 	const struct xfs_health_monitor	*hmo)
 {
 	if (hmo->flags & ~XFS_HEALTH_MONITOR_ALL)
 		return false;
 	if (hmo->format != XFS_HEALTH_MONITOR_FMT_V0)
 		return false;
 	if (memchr_inv(&hmo->pad, 0, sizeof(hmo->pad)))
 		return false;
 	return true;
 }

 /*
  * Emit some data about the health monitoring fd: whether the monitor is
  * still attached to its mount, the device it was created for, the event
  * format, and the lifetime counts of events and lost events.
  */
 static void
 xfs_healthmon_show_fdinfo(
 	struct seq_file		*m,
 	struct file		*file)
 {
 	struct xfs_healthmon	*hm = file->private_data;
 	const char		*state;

 	mutex_lock(&hm->lock);
 	if (hm->mount_cookie == DETACHED_MOUNT_COOKIE)
 		state = "dead";
 	else
 		state = "alive";
 	seq_printf(m, "state:\t%s\ndev:\t%d:%d\nformat:\tv0\nevents:\t%llu\nlost:\t%llu\n",
 			state,
 			MAJOR(hm->dev), MINOR(hm->dev),
 			hm->total_events,
 			hm->total_lost);
 	mutex_unlock(&hm->lock);
 }

 /*
  * Reconfigure the health monitor from a user-supplied xfs_health_monitor
  * structure.  Only the verbose setting can be changed.  Returns 0, -EFAULT if
  * the structure cannot be read, or -EINVAL if it fails validation.
  */
 STATIC long
 xfs_healthmon_reconfigure(
 	struct file			*file,
 	unsigned int			cmd,
 	void __user			*arg)
 {
 	struct xfs_healthmon		*hm = file->private_data;
 	struct xfs_health_monitor	hmo;
 	bool				verbose;

 	if (copy_from_user(&hmo, arg, sizeof(hmo)))
 		return -EFAULT;

 	if (!xfs_healthmon_validate(&hmo))
 		return -EINVAL;

 	verbose = (hmo.flags & XFS_HEALTH_MONITOR_VERBOSE) != 0;

 	mutex_lock(&hm->lock);
 	hm->verbose = verbose;
 	mutex_unlock(&hm->lock);

 	return 0;
 }

 /*
  * Does the fd point to the same filesystem as the one we're monitoring?
  * Returns 0 if so, -ESTALE if the file lives on a different superblock or the
  * monitor has detached, -EBADF for a bad fd, -EINVAL if any flags are set, or
  * -EFAULT if the argument cannot be read.
  */
 STATIC long
 xfs_healthmon_file_on_monitored_fs(
 	struct file			*file,
 	unsigned int			cmd,
 	void __user			*arg)
 {
 	struct xfs_health_file_on_monitored_fs hms;
 	struct xfs_healthmon		*hm = file->private_data;
 	struct inode			*hms_inode;
 	long				ret = 0;

 	if (copy_from_user(&hms, arg, sizeof(hms)))
 		return -EFAULT;

 	if (hms.flags)
 		return -EINVAL;

 	CLASS(fd, hms_fd)(hms.fd);
 	if (fd_empty(hms_fd))
 		return -EBADF;

 	hms_inode = file_inode(fd_file(hms_fd));

 	/* The cookie is the superblock address while we're attached. */
 	mutex_lock(&hm->lock);
 	if (hm->mount_cookie != (uintptr_t)hms_inode->i_sb)
 		ret = -ESTALE;
 	mutex_unlock(&hm->lock);

 	return ret;
 }

 /*
  * Handle ioctls issued on the health monitoring file.  Unknown commands get
  * -ENOTTY.
  */
 STATIC long
 xfs_healthmon_ioctl(
 	struct file			*file,
 	unsigned int			cmd,
 	unsigned long			p)
 {
 	void __user			*arg = (void __user *)p;

 	if (cmd == XFS_IOC_HEALTH_MONITOR)
 		return xfs_healthmon_reconfigure(file, cmd, arg);

 	if (cmd == XFS_IOC_HEALTH_FD_ON_MONITORED_FS)
 		return xfs_healthmon_file_on_monitored_fs(file, cmd, arg);

 	return -ENOTTY;
 }

 /*
  * File operations for the anonymous health monitoring file handed out by
  * xfs_ioc_health_monitor.  The file is read-only: events are consumed with
  * read and poll, and the monitor is adjusted or queried through ioctls.
  */
 static const struct file_operations xfs_healthmon_fops = {
 	.owner		= THIS_MODULE,
 	.show_fdinfo	= xfs_healthmon_show_fdinfo,
 	.read_iter	= xfs_healthmon_read_iter,
 	.poll		= xfs_healthmon_poll,
 	.release	= xfs_healthmon_release,
 	.unlocked_ioctl	= xfs_healthmon_ioctl,
 };

 /*
  * Create a health monitoring file.  Returns an index to the fd table or a
  * negative errno.
  *
  * @file must be the root directory of the filesystem to monitor and the
  * caller must have CAP_SYS_ADMIN in the initial user namespace, else -EPERM.
  * Other errors are -EFAULT if @arg cannot be read, -EINVAL if it fails
  * validation, -ENOMEM, -EEXIST if the mount already has a health monitor,
  * or whatever creating the anonymous file returns.
  */
 long
 xfs_ioc_health_monitor(
 	struct file			*file,
 	struct xfs_health_monitor __user *arg)
 {
 	struct xfs_health_monitor	hmo;
 	struct xfs_healthmon_event	*running_event;
 	struct xfs_healthmon		*hm;
 	struct xfs_inode		*ip = XFS_I(file_inode(file));
 	struct xfs_mount		*mp = ip->i_mount;
 	int				ret;

 	/*
 	 * The only intended user of the health monitoring system should be the
 	 * xfs_healer daemon running on behalf of the whole filesystem in the
 	 * initial user namespace.  IOWs, we don't allow unprivileged userspace
 	 * (they can use fsnotify) nor do we allow containers.
 	 */
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	if (ip->i_ino != mp->m_sb.sb_rootino)
 		return -EPERM;
 	if (current_user_ns() != &init_user_ns)
 		return -EPERM;

 	if (copy_from_user(&hmo, arg, sizeof(hmo)))
 		return -EFAULT;

 	if (!xfs_healthmon_validate(&hmo))
 		return -EINVAL;

 	hm = kzalloc(sizeof(*hm), GFP_KERNEL);
 	if (!hm)
 		return -ENOMEM;
 	hm->dev = mp->m_super->s_dev;
 	/* This initial reference is handed to the file if we get that far. */
 	refcount_set(&hm->ref, 1);

 	mutex_init(&hm->lock);
 	init_waitqueue_head(&hm->wait);

 	if (hmo.flags & XFS_HEALTH_MONITOR_VERBOSE)
 		hm->verbose = true;

 	/* Queue up the first event that lets the client know we're running. */
 	running_event = kzalloc(sizeof(struct xfs_healthmon_event), GFP_NOFS);
 	if (!running_event) {
 		ret = -ENOMEM;
 		goto out_hm;
 	}
 	running_event->type = XFS_HEALTHMON_RUNNING;
 	running_event->domain = XFS_HEALTHMON_MOUNT;
 	/* hm is not attached to the mount yet, so nobody else can see it. */
 	__xfs_healthmon_insert(hm, running_event);

 	/*
 	 * Preallocate the unmount event so that we can't fail to notify the
 	 * filesystem later.  This is key for triggering fast exit of the
 	 * xfs_healer daemon.
 	 */
 	hm->unmount_event = kzalloc(sizeof(struct xfs_healthmon_event),
 			GFP_NOFS);
 	if (!hm->unmount_event) {
 		ret = -ENOMEM;
 		goto out_hm;
 	}
 	hm->unmount_event->type = XFS_HEALTHMON_UNMOUNT;
 	hm->unmount_event->domain = XFS_HEALTHMON_MOUNT;

 	/*
 	 * Try to attach this health monitor to the xfs_mount.  The monitor is
 	 * considered live and will receive events if this succeeds.
 	 */
 	ret = xfs_healthmon_attach(mp, hm);
 	if (ret)
 		goto out_hm;

 	/*
 	 * Create the anonymous file and install a fd for it.  If it succeeds,
 	 * the file owns hm and can go away at any time, so we must not access
 	 * it again.  This must go last because we can't undo a fd table
 	 * installation.
 	 */
 	ret = anon_inode_getfd("xfs_healthmon", &xfs_healthmon_fops, hm,
 			O_CLOEXEC | O_RDONLY);
 	if (ret < 0)
 		goto out_mp;

 	trace_xfs_healthmon_create(mp->m_super->s_dev, hmo.flags, hmo.format);

 	return ret;

 out_mp:
 	/* Undo the attach, which also drops the mount's reference to hm. */
 	xfs_healthmon_detach(hm);
 out_hm:
 	/*
 	 * Dropping the last reference frees hm along with any queued events
 	 * and the preallocated unmount event.
 	 */
 	ASSERT(refcount_read(&hm->ref) == 1);
 	xfs_healthmon_put(hm);
 	return ret;
 }