smp: Make CSD lock acquisition atomic for debug mode

Commit b0473dcd4b1d ("smp: Improve smp_call_function_single()
CSD-lock diagnostics") changed smp_call_function_single() so that,
when CSD lock debugging is enabled, async !wait calls use the
destination CPU csd_data. That improves diagnostics, but it also removes
the single-writer property that made the old csd_lock() safe: multiple
CPUs can now prepare the same destination CPU CSD concurrently.

csd_lock() currently waits for CSD_FLAG_LOCK to clear and then sets the
bit with a non-atomic read-modify-write. Two senders can both see an
unlocked CSD, set the bit, overwrite the callback fields, and enqueue
the same llist node. Re-adding a node that is already the queue head can
make node->next point to itself, leaving the target CPU stuck walking
call_single_queue. Later synchronous work, such as a TLB shootdown, can
then remain queued and trigger soft-lockup warnings or panics.

Keep the single csd_lock() implementation, but when CSD lock debugging is
enabled, acquire CSD_FLAG_LOCK with try_cmpxchg_acquire(). This makes the
destination CPU CSD a real atomic lock in the only configuration where it
can be shared by multiple remote senders, while preserving the existing
non-debug fast path.

Fixes: b0473dcd4b1d ("smp: Improve smp_call_function_single() CSD-lock diagnostics")
Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
diff --git a/kernel/smp.c b/kernel/smp.c
index dc6582b..b9448fe 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -137,10 +137,10 @@ csd_do_func(smp_call_func_t func, void *info, call_single_data_t *csd)
 	trace_csd_function_exit(func, csd);
 }
 
-#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
-
 static DEFINE_STATIC_KEY_MAYBE(CONFIG_CSD_LOCK_WAIT_DEBUG_DEFAULT, csdlock_debug_enabled);
 
+#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
+
 /*
  * Parse the csdlock_debug= kernel boot parameter.
  *
@@ -342,6 +342,10 @@ static __always_inline void csd_lock_wait(call_single_data_t *csd)
 	smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
 }
 #else
+static __always_inline void __csd_lock_wait(call_single_data_t *csd)
+{
+}
+
 static void csd_lock_record(call_single_data_t *csd)
 {
 }
@@ -354,8 +358,22 @@ static __always_inline void csd_lock_wait(call_single_data_t *csd)
 
 static __always_inline void csd_lock(call_single_data_t *csd)
 {
-	csd_lock_wait(csd);
-	csd->node.u_flags |= CSD_FLAG_LOCK;
+	if (IS_ENABLED(CONFIG_CSD_LOCK_WAIT_DEBUG) &&
+	    static_branch_unlikely(&csdlock_debug_enabled)) {
+		unsigned int flags;
+
+		for (;;) {
+			__csd_lock_wait(csd);
+			flags = READ_ONCE(csd->node.u_flags);
+			if (!(flags & CSD_FLAG_LOCK) &&
+			    try_cmpxchg_acquire(&csd->node.u_flags, &flags,
+						flags | CSD_FLAG_LOCK))
+				break;
+		}
+	} else {
+		csd_lock_wait(csd);
+		csd->node.u_flags |= CSD_FLAG_LOCK;
+	}
 
 	/*
 	 * prevent CPU from reordering the above assignment