blob: ec3478bc505ef97fa664794290071c955f6191fe [file] [log] [blame]
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (c) 2021-2024 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_ag.h"
#include "xfs_error.h"
#include "xfs_bit.h"
#include "xfs_icache.h"
#include "scrub/scrub.h"
#include "scrub/iscan.h"
#include "scrub/common.h"
#include "scrub/trace.h"
/*
* Live File Scan
* ==============
*
* Live file scans walk every inode in a live filesystem. This is more or
* less like a regular iwalk, except that when we're advancing the scan cursor,
* we must ensure that inodes cannot be added or deleted anywhere between the
* old cursor value and the new cursor value. If we're advancing the cursor
* by one inode, the caller must hold that inode; if we're finding the next
* inode to scan, we must grab the AGI and hold it until we've updated the
* scan cursor.
*
* Callers are expected to use this code to scan all files in the filesystem to
* construct a new metadata index of some kind. The scan races against other
* live updates, which means there must be a provision to update the new index
* when updates are made to inodes that have already been scanned. The iscan
* lock can be used in live update hook code to stop the scan and protect this
* data structure.
*
* To keep the new index up to date with other metadata updates being made to
* the live filesystem, it is assumed that the caller will add hooks as needed
* to be notified when a metadata update occurs. The inode scanner must tell
* the hook code when an inode has been visited with xchk_iscan_mark_visited.
* Hook functions can use xchk_iscan_want_live_update to decide if the
* scanner's observations must be updated.
*/
/*
 * If the inobt record @rec covers @iscan->skip_ino, mark that inode free in
 * @rec so that the scan ignores it.
 *
 * @pag is the AG that @rec belongs to and @lastrecino is the last AG inode
 * number covered by @rec.  Nothing happens if skip_ino lives in a different
 * AG or falls outside of [rec->ir_startino, lastrecino].
 */
STATIC void
xchk_iscan_mask_skipino(
	struct xchk_iscan	*iscan,
	struct xfs_perag	*pag,
	struct xfs_inobt_rec_incore	*rec,
	xfs_agino_t		lastrecino)
{
	struct xfs_scrub	*sc = iscan->sc;
	struct xfs_mount	*mp = sc->mp;
	xfs_agnumber_t		skip_agno = XFS_INO_TO_AGNO(mp, iscan->skip_ino);
	/* This is an AG inode number, not an AG number. */
	xfs_agino_t		skip_agino = XFS_INO_TO_AGINO(mp, iscan->skip_ino);

	if (pag->pag_agno != skip_agno)
		return;
	if (skip_agino < rec->ir_startino)
		return;
	if (skip_agino > lastrecino)
		return;

	/* Set the single ir_free bit that corresponds to the skipped inode. */
	rec->ir_free |= xfs_inobt_maskn(skip_agino - rec->ir_startino, 1);
}
/*
 * Find the next allocated inode in this AG that comes after *cursor.
 *
 * On success, *cursor is set to the AG inode number of that inode,
 * *allocmaskp to the chunk's allocation bitmap shifted so that bit 0
 * corresponds to *cursor, and *nr_inodesp to the number of inodes from
 * *cursor to the end of the chunk, inclusive.  If this AG has no more
 * allocated inodes, *cursor is set to NULLAGINO and the other two outputs are
 * left untouched.
 *
 * Returns 0 or a negative errno.  -EFSCORRUPTED is returned if the inobt
 * records do not move strictly forward or if a record that the btree cursor
 * points at cannot be read back.
 */
STATIC int
xchk_iscan_find_next(
	struct xchk_iscan	*iscan,
	struct xfs_buf		*agi_bp,
	struct xfs_perag	*pag,
	xfs_inofree_t		*allocmaskp,
	xfs_agino_t		*cursor,
	uint8_t			*nr_inodesp)
{
	struct xfs_inobt_rec_incore	rec;
	struct xfs_scrub	*sc = iscan->sc;
	struct xfs_mount	*mp = sc->mp;
	struct xfs_btree_cur	*cur;
	xfs_agino_t		ag_first, ag_last;
	xfs_agino_t		lastino = NULLAGINO;
	xfs_agino_t		agino = *cursor;
	int			found;
	int			error;

	/* A cursor past the end of this AG means there's nothing left here. */
	xfs_agino_range(mp, pag->pag_agno, &ag_first, &ag_last);
	if (agino > ag_last) {
		*cursor = NULLAGINO;
		return 0;
	}

	/*
	 * Start with the inobt record at or before the cursor.  If there is
	 * no such record, step forward to the first record after the cursor.
	 */
	cur = xfs_inobt_init_cursor(pag, sc->tp, agi_bp);
	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &found);
	if (!error && !found)
		error = xfs_btree_increment(cur, 0, &found);

	while (!error) {
		xfs_inofree_t	inuse;

		/*
		 * Ran off the end of the inobt.  Tell the caller that this AG
		 * is exhausted so that it can try the next one.
		 */
		if (!found) {
			*cursor = NULLAGINO;
			break;
		}

		error = xfs_inobt_get_rec(cur, &rec, &found);
		if (error)
			break;
		if (!found) {
			error = -EFSCORRUPTED;
			break;
		}

		/* Each record must start after the end of the previous one. */
		if (lastino != NULLAGINO &&
		    XFS_IS_CORRUPT(mp, lastino >= rec.ir_startino)) {
			error = -EFSCORRUPTED;
			break;
		}
		lastino = rec.ir_startino + XFS_INODES_PER_CHUNK - 1;

		/* Records that end before the cursor are of no interest. */
		if (rec.ir_startino + XFS_INODES_PER_CHUNK > agino) {
			if (iscan->skip_ino)
				xchk_iscan_mask_skipino(iscan, pag, &rec,
						lastino);

			/*
			 * The cursor sits inside this record.  Pretend that
			 * the cursor inode and everything before it are free
			 * so that the search below only sees later inodes.
			 * ir_freecount is deliberately not kept in sync.
			 */
			if (agino >= rec.ir_startino)
				rec.ir_free |= xfs_inobt_maskn(0,
						agino + 1 - rec.ir_startino);

			/*
			 * Any bit still clear in ir_free is an allocated
			 * inode; the lowest one is the new cursor position.
			 */
			inuse = ~rec.ir_free;
			if (inuse != 0) {
				int	next = xfs_lowbit64(inuse);

				ASSERT(next >= 0);
				*cursor = rec.ir_startino + next;
				*allocmaskp = inuse >> next;
				*nr_inodesp = XFS_INODES_PER_CHUNK - next;
				break;
			}
		}

		error = xfs_btree_increment(cur, 0, &found);
	}

	xfs_btree_del_cursor(cur, error);
	return error;
}
/*
 * Move the scan cursor to (@agno, @agino) and drag the visited cursor along
 * so that it sits one inode behind the scan cursor.
 *
 * Inode numbers are sparse, so a single iter() call can move the scan cursor
 * a long way.  Nothing is allocated in the gap that was jumped over, so the
 * visited cursor has to advance too; otherwise the scan user would miss live
 * updates for inodes that get created in that gap after the AGI buffer is
 * released.
 */
static inline void
xchk_iscan_move_cursor(
	struct xchk_iscan	*iscan,
	xfs_agnumber_t		agno,
	xfs_agino_t		agino)
{
	struct xfs_mount	*mp = iscan->sc->mp;
	xfs_ino_t		new_cursor = XFS_AGINO_TO_INO(mp, agno, agino);
	xfs_ino_t		new_visited;

	BUILD_BUG_ON(XFS_MAXINUMBER == NULLFSINO);

	/*
	 * When the scan wraps around to inode 0, "cursor - 1" would be
	 * NULLFSINO, which means "scan finished, pass all live updates".
	 * Use the largest valid inode number instead.
	 */
	new_visited = new_cursor ? new_cursor - 1 : XFS_MAXINUMBER;

	mutex_lock(&iscan->lock);
	iscan->cursor_ino = new_cursor;
	iscan->__visited_ino = new_visited;
	trace_xchk_iscan_move_cursor(iscan);
	mutex_unlock(&iscan->lock);
}
/*
* Mark the scan as finished by setting both the scan cursor and the visited
* cursor to NULLFSINO. xchk_iscan_want_live_update treats a visited cursor of
* NULLFSINO as "scan finished" and lets every live update through.
*/
static inline void
xchk_iscan_finish(
struct xchk_iscan *iscan)
{
/* Both cursors are changed together under the iscan lock. */
mutex_lock(&iscan->lock);
iscan->cursor_ino = NULLFSINO;
/* All live updates will be applied from now on */
iscan->__visited_ino = NULLFSINO;
mutex_unlock(&iscan->lock);
}
/*
* Advance ino to the next inode that the inobt thinks is allocated, being
* careful to jump to the next AG if we've reached the right end of this AG's
* inode btree. Advancing ino effectively means that we've pushed the inode
* scan forward, so the visited cursor is set to (ino - 1) so that our live
* update predicates will track inode allocations in that part of the inode
* number key space once we release the AGI buffer.
*
* When an inode is found, the AGI buffer and the perag reference are handed
* back through @agi_bpp and @pagp and releasing them becomes the caller's job.
* On every other return path they have already been released here.
* @allocmaskp and @nr_inodesp are filled in by xchk_iscan_find_next.
*
* Returns 1 if there's a new inode to examine, 0 if we've run out of inodes,
* -ECANCELED if the live scan aborted or no perag could be obtained for the
* cursor's AG, or the usual negative errno.
*/
STATIC int
xchk_iscan_advance(
struct xchk_iscan *iscan,
struct xfs_perag **pagp,
struct xfs_buf **agi_bpp,
xfs_inofree_t *allocmaskp,
uint8_t *nr_inodesp)
{
struct xfs_scrub *sc = iscan->sc;
struct xfs_mount *mp = sc->mp;
struct xfs_buf *agi_bp;
struct xfs_perag *pag;
xfs_agnumber_t agno;
xfs_agino_t agino;
int ret;
/* The scan cursor must never be behind the visited cursor. */
ASSERT(iscan->cursor_ino >= iscan->__visited_ino);
do {
if (xchk_iscan_aborted(iscan))
return -ECANCELED;
/* Grab the perag and the AGI of whichever AG the cursor is in. */
agno = XFS_INO_TO_AGNO(mp, iscan->cursor_ino);
pag = xfs_perag_get(mp, agno);
if (!pag)
return -ECANCELED;
ret = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp);
if (ret)
goto out_pag;
/* Look for the next allocated inode after the cursor in this AG. */
agino = XFS_INO_TO_AGINO(mp, iscan->cursor_ino);
ret = xchk_iscan_find_next(iscan, agi_bp, pag, allocmaskp,
&agino, nr_inodesp);
if (ret)
goto out_buf;
if (agino != NULLAGINO) {
/*
* Found the next inode in this AG, so return it along
* with the AGI buffer and the perag structure to
* ensure it cannot go away.
*/
xchk_iscan_move_cursor(iscan, agno, agino);
*agi_bpp = agi_bp;
*pagp = pag;
return 1;
}
/*
* Did not find any more inodes in this AG, move on to the next
* AG, wrapping back around to AG 0 after the last one. The
* cursor is moved before the AGI buffer is released.
*/
agno = (agno + 1) % mp->m_sb.sb_agcount;
xchk_iscan_move_cursor(iscan, agno, 0);
xfs_trans_brelse(sc->tp, agi_bp);
xfs_perag_put(pag);
trace_xchk_iscan_advance_ag(iscan);
/* Keep going until the cursor is back where the scan started. */
} while (iscan->cursor_ino != iscan->scan_start_ino);
/* Every AG has been walked, so the scan is complete. */
xchk_iscan_finish(iscan);
return 0;
/* Error exits: release whatever this iteration had acquired so far. */
out_buf:
xfs_trans_brelse(sc->tp, agi_bp);
out_pag:
xfs_perag_put(pag);
return ret;
}
/*
 * Grabbing the inode failed, so back the scan cursor up by one inode and ask
 * the caller to run xchk_iscan_advance again, which will then find the same
 * inode once more.  If @wait is set, sleep for iget_retry_delay milliseconds
 * before backing up.
 *
 * Returns -EBUSY if no iget timeout was configured or the retry deadline has
 * passed, -ECANCELED if the sleep ended early or the scan was aborted, or
 * -EAGAIN if the caller should try again.
 */
STATIC int
xchk_iscan_iget_retry(
	struct xchk_iscan	*iscan,
	bool			wait)
{
	ASSERT(iscan->cursor_ino == iscan->__visited_ino + 1);

	/* Retrying is only possible if the caller configured a timeout... */
	if (!iscan->iget_timeout)
		return -EBUSY;

	/* ...and only for as long as the deadline has not expired. */
	if (time_is_before_jiffies(iscan->__iget_deadline))
		return -EBUSY;

	if (wait) {
		unsigned long	delay;

		delay = msecs_to_jiffies(iscan->iget_retry_delay);

		trace_xchk_iscan_iget_retry_wait(iscan);

		/*
		 * Let the rest of the system catch up for a while.  Coming
		 * back before the delay has elapsed means that someone sent a
		 * kill signal to the calling process.
		 */
		if (schedule_timeout_killable(delay))
			return -ECANCELED;
		if (xchk_iscan_aborted(iscan))
			return -ECANCELED;
	}

	iscan->cursor_ino--;
	return -EAGAIN;
}
/*
 * Grab the inode at the scan cursor, then try to fill the rest of the
 * __inodes batch with the allocated inodes that follow it in the same inobt
 * chunk.  While scanning an inode, the caller must ensure that no other
 * threads can modify it until a call to xchk_iscan_mark_visited succeeds.
 *
 * @allocmask has bit 0 set for the cursor inode and @nr_inodes is the number
 * of inodes between the cursor and the end of the chunk.  @agi_bp and @pag
 * are always released before this function returns.
 *
 * Returns the number of incore inodes grabbed; -EAGAIN if the caller should
 * call xchk_iscan_advance again; -EBUSY if we couldn't grab an inode;
 * -ECANCELED if there's a fatal signal pending; or some other negative errno.
 */
STATIC int
xchk_iscan_iget(
	struct xchk_iscan	*iscan,
	struct xfs_perag	*pag,
	struct xfs_buf		*agi_bp,
	xfs_inofree_t		allocmask,
	uint8_t			nr_inodes)
{
	struct xfs_scrub	*sc = iscan->sc;
	struct xfs_mount	*mp = sc->mp;
	xfs_ino_t		ino = iscan->cursor_ino;
	unsigned int		idx = 0;
	unsigned int		i;
	int			error;

	ASSERT(iscan->__inodes[0] == NULL);

	/* The inode at the cursor always goes into the first slot. */
	error = xfs_iget(mp, sc->tp, ino, XFS_IGET_NORETRY, 0,
			&iscan->__inodes[idx]);

	trace_xchk_iscan_iget(iscan, error);

	if (error) {
		/* Nothing was grabbed, so let go of the AGI and the perag. */
		xfs_trans_brelse(sc->tp, agi_bp);
		xfs_perag_put(pag);

		switch (error) {
		case -ENOENT:
		case -EAGAIN:
			/*
			 * It's possible that this inode has lost all of its
			 * links but hasn't yet been inactivated.  Flush the
			 * inodegc workers, then wait a bit and retry.
			 */
			xfs_inodegc_flush(mp);
			return xchk_iscan_iget_retry(iscan, true);
		case -EINVAL:
			/*
			 * We thought the inode was allocated, but the inode
			 * btree lookup failed, which means that it was freed
			 * since the last time we advanced the cursor.  This
			 * should never happen since we still hold the AGI
			 * buffer from the inobt check, but back up and try
			 * again, taking care not to loop forever.
			 */
			return xchk_iscan_iget_retry(iscan, false);
		default:
			return error;
		}
	}

	idx++;
	ino++;
	allocmask >>= 1;

	/*
	 * The first slot is full.  Try to fill the rest of the batch with
	 * the inodes that follow, to reduce the number of _iter calls.
	 * Inodes whose allocmask bit is clear are not scanned; they are
	 * recorded in __skipped_inomask so that the _want_live_update
	 * predicate passes through all live updates for them.
	 *
	 * If we can't iget an allocated inode, stop and return what we have.
	 */
	mutex_lock(&iscan->lock);
	iscan->__batch_ino = ino - 1;
	iscan->__skipped_inomask = 0;
	mutex_unlock(&iscan->lock);

	for (i = 1; i < nr_inodes; i++, ino++, allocmask >>= 1) {
		if (allocmask & 1) {
			ASSERT(iscan->__inodes[idx] == NULL);

			error = xfs_iget(mp, sc->tp, ino, XFS_IGET_NORETRY, 0,
					&iscan->__inodes[idx]);
			if (error)
				break;

			mutex_lock(&iscan->lock);
			iscan->cursor_ino = ino;
			mutex_unlock(&iscan->lock);
			idx++;
		} else {
			ASSERT(!(iscan->__skipped_inomask & (1ULL << i)));

			mutex_lock(&iscan->lock);
			iscan->cursor_ino = ino;
			iscan->__skipped_inomask |= (1ULL << i);
			mutex_unlock(&iscan->lock);
		}
	}

	trace_xchk_iscan_iget_batch(sc->mp, iscan, nr_inodes, idx);
	xfs_trans_brelse(sc->tp, agi_bp);
	xfs_perag_put(pag);
	return idx;
}
/*
 * Close out the previous batch.  If a batch was loaded, push the visited
 * cursor forward to cover the highest inode that the batch skipped, unless
 * the cursor is already past it.  Then forget about the batch.
 */
STATIC void
xchk_iscan_finish_batch(
	struct xchk_iscan	*iscan)
{
	mutex_lock(&iscan->lock);

	if (iscan->__batch_ino != NULLFSINO) {
		xfs_ino_t	highest_skipped;

		highest_skipped = iscan->__batch_ino +
				xfs_highbit64(iscan->__skipped_inomask);

		/* The visited cursor only ever moves forward here. */
		if (highest_skipped > iscan->__visited_ino)
			iscan->__visited_ino = highest_skipped;

		trace_xchk_iscan_skip(iscan);
	}

	iscan->__batch_ino = NULLFSINO;
	iscan->__skipped_inomask = 0;

	mutex_unlock(&iscan->lock);
}
/*
 * Load the next batch of inodes: advance the scan cursor to the next
 * allocated inode and grab it, along with as many of the allocated inodes
 * following it in the same inobt chunk as possible.
 *
 * Returns the number of inodes grabbed, 0 if the scan has run out of inodes,
 * or a negative errno.
 */
STATIC int
xchk_iscan_iter_batch(
	struct xchk_iscan	*iscan)
{
	struct xfs_scrub	*sc = iscan->sc;
	int			ret;

	xchk_iscan_finish_batch(iscan);

	/* Start the clock on iget retries for this batch. */
	if (iscan->iget_timeout)
		iscan->__iget_deadline = jiffies +
					 msecs_to_jiffies(iscan->iget_timeout);

	for (;;) {
		struct xfs_buf		*agi_bp = NULL;
		struct xfs_perag	*pag = NULL;
		xfs_inofree_t		allocmask = 0;
		uint8_t			nr_inodes = 0;

		ret = xchk_iscan_advance(iscan, &pag, &agi_bp, &allocmask,
				&nr_inodes);
		if (ret != 1)
			return ret;

		/* _advance handed us the AGI and the perag; drop them. */
		if (xchk_iscan_aborted(iscan)) {
			xfs_trans_brelse(sc->tp, agi_bp);
			xfs_perag_put(pag);
			return -ECANCELED;
		}

		/* _iget releases the AGI and the perag on every path. */
		ret = xchk_iscan_iget(iscan, pag, agi_bp, allocmask, nr_inodes);
		if (ret != -EAGAIN)
			return ret;
	}
}
/*
 * Hand the next inode of the scan to the caller through @ipp.  Inodes left
 * over from the current batch are returned first; once the batch is used up,
 * the scan cursor is advanced and a new batch is loaded.
 *
 * Returns 1 if there's a new inode to examine, 0 if we've run out of inodes,
 * -ECANCELED if the live scan aborted, -EBUSY if the incore inode could not be
 * grabbed, or the usual negative errno.
 *
 * If the function returns -EBUSY and the caller can handle skipping an inode,
 * it may call this function again to continue the scan with the next allocated
 * inode.
 */
int
xchk_iscan_iter(
	struct xchk_iscan	*iscan,
	struct xfs_inode	**ipp)
{
	unsigned int		i;
	int			error;

	/* Look for the first inode that is still cached in the batch. */
	for (i = 0; i < XFS_INODES_PER_CHUNK; i++) {
		if (iscan->__inodes[i])
			break;
	}

	/* The batch is empty, so go get another one. */
	if (i == XFS_INODES_PER_CHUNK) {
		error = xchk_iscan_iter_batch(iscan);
		if (error <= 0)
			return error;

		ASSERT(iscan->__inodes[0] != NULL);
		i = 0;
	}

	/* Give the caller our reference. */
	*ipp = iscan->__inodes[i];
	iscan->__inodes[i] = NULL;
	return 1;
}
/*
 * Clean up after xchk_iscan_iter by releasing every inode that is still
 * cached in the batch array.
 */
void
xchk_iscan_iter_finish(
	struct xchk_iscan	*iscan)
{
	struct xfs_scrub	*sc = iscan->sc;
	unsigned int		slot;

	for (slot = 0; slot < XFS_INODES_PER_CHUNK; slot++) {
		if (!iscan->__inodes[slot])
			continue;

		xchk_irele(sc, iscan->__inodes[slot]);
		iscan->__inodes[slot] = NULL;
	}
}
/*
* Mark this inode scan finished and release resources: drop any inodes that
* are still cached from the last batch, put the cursors into their "scan
* finished" state, and destroy the iscan lock.
*/
void
xchk_iscan_teardown(
struct xchk_iscan *iscan)
{
xchk_iscan_iter_finish(iscan);
xchk_iscan_finish(iscan);
/* xchk_iscan_finish takes the lock, so destroy it last. */
mutex_destroy(&iscan->lock);
}
/* Pick an AG from which to start a scan. */
static inline xfs_ino_t
xchk_iscan_rotor(
struct xfs_mount *mp)
{
static atomic_t agi_rotor;
unsigned int r = atomic_inc_return(&agi_rotor) - 1;
/*
* Rotoring *backwards* through the AGs, so we add one here before
* subtracting from the agcount to arrive at an AG number.
*/
r = (r % mp->m_sb.sb_agcount) + 1;
return XFS_AGINO_TO_INO(mp, mp->m_sb.sb_agcount - r, 0);
}
/*
 * Initialize @iscan so that an inode scan can begin.  If the @iget_timeout
 * and @iget_retry_delay parameters are set, the scan will try to iget each
 * inode for @iget_timeout milliseconds.  If an iget call indicates that the
 * inode is waiting to be inactivated, the CPU will relax for
 * @iget_retry_delay milliseconds after pushing the inactivation workers.
 */
void
xchk_iscan_start(
	struct xfs_scrub	*sc,
	unsigned int		iget_timeout,
	unsigned int		iget_retry_delay,
	struct xchk_iscan	*iscan)
{
	xfs_ino_t		start_ino = xchk_iscan_rotor(sc->mp);

	/* Caller-supplied configuration. */
	iscan->sc = sc;
	iscan->iget_timeout = iget_timeout;
	iscan->iget_retry_delay = iget_retry_delay;

	/* Both cursors begin at the inode number chosen by the rotor. */
	iscan->scan_start_ino = start_ino;
	iscan->cursor_ino = start_ino;
	iscan->__visited_ino = start_ino;

	/* No batch loaded, no inodes cached, and the scan is not aborted. */
	iscan->__batch_ino = NULLFSINO;
	iscan->__skipped_inomask = 0;
	memset(iscan->__inodes, 0, sizeof(iscan->__inodes));
	clear_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate);

	mutex_init(&iscan->lock);

	trace_xchk_iscan_start(iscan, start_ino);
}
/*
* Mark this inode as having been visited by moving the visited cursor to its
* inode number. Callers must hold a sufficiently exclusive lock on the inode
* to prevent concurrent modifications.
*/
void
xchk_iscan_mark_visited(
struct xchk_iscan *iscan,
struct xfs_inode *ip)
{
/* Update the visited cursor under the iscan lock. */
mutex_lock(&iscan->lock);
iscan->__visited_ino = ip->i_ino;
trace_xchk_iscan_visit(iscan);
mutex_unlock(&iscan->lock);
}
/*
 * Was @ino skipped because it wasn't allocated when the current batch was
 * loaded?  If so, it is newly allocated and will not be scanned, so all live
 * updates to it must be passed to the caller to maintain scan correctness.
 *
 * Returns false if no batch is loaded or if @ino lies outside of the
 * XFS_INODES_PER_CHUNK inode numbers that start at __batch_ino.
 */
static inline bool
xchk_iscan_skipped(
	const struct xchk_iscan	*iscan,
	xfs_ino_t		ino)
{
	xfs_ino_t		batch_start = iscan->__batch_ino;

	if (batch_start == NULLFSINO ||
	    ino < batch_start ||
	    ino >= batch_start + XFS_INODES_PER_CHUNK)
		return false;

	return iscan->__skipped_inomask & (1ULL << (ino - batch_start));
}
/*
 * Should the caller apply a live update for inode @ino?  That is the case if
 * the scanner thread has already visited this inode and the scan hasn't been
 * aborted due to errors.  Callers must hold a sufficiently exclusive lock on
 * the inode to prevent scanners from reading any inode metadata.
 */
bool
xchk_iscan_want_live_update(
	struct xchk_iscan	*iscan,
	xfs_ino_t		ino)
{
	bool			wanted;

	if (xchk_iscan_aborted(iscan))
		return false;

	mutex_lock(&iscan->lock);

	trace_xchk_iscan_want_live_update(iscan, ino);

	if (iscan->__visited_ino == NULLFSINO) {
		/* Scan is finished, caller should receive all updates. */
		wanted = true;
	} else if (iscan->scan_start_ino == iscan->__visited_ino) {
		/*
		 * No inodes have been visited yet, so the visited cursor
		 * points at the start of the scan range.  The caller should
		 * not receive any updates.
		 */
		wanted = false;
	} else if (xchk_iscan_skipped(iscan, ino)) {
		/*
		 * This inode was not allocated at the time of the iscan
		 * batch.  The caller should receive all updates.
		 */
		wanted = true;
	} else if (iscan->scan_start_ino <= iscan->__visited_ino) {
		/*
		 * The visited cursor hasn't yet wrapped around the end of the
		 * FS.  Updates are wanted if @ino is in the starred range:
		 *
		 * 0 ------------ S ************ V ------------ EOFS
		 */
		wanted = ino >= iscan->scan_start_ino &&
			 ino <= iscan->__visited_ino;
	} else {
		/*
		 * The visited cursor wrapped around the end of the FS.
		 * Updates are wanted if @ino is in either starred range:
		 *
		 * 0 ************ V ------------ S ************ EOFS
		 */
		wanted = ino >= iscan->scan_start_ino ||
			 ino <= iscan->__visited_ino;
	}

	mutex_unlock(&iscan->lock);
	return wanted;
}