Merge branches 'cmpxchg.2024.05.11a', 'kcsan.2024.05.07a', 'lkmm.2024.05.06a', 'rcu-merge.2024.05.01a' and 'tsc.2024.04.09c' into HEAD

cmpxchg.2024.05.11a: Single-byte cmpxchg() emulation and use in csky.
kcsan.2024.05.07a: Add __data_racy type qualifier.
lkmm.2024.05.06a: Linux kernel memory model updates.
rcu-merge.2024.05.01a: RCU commits (via Uladzislau Rezki).
tsc.2024.04.09c: TSC watchdog updates (likely via -tip).
diff --git a/Documentation/atomic_t.txt b/Documentation/atomic_t.txt
index d7adc6d..bee3b1b 100644
--- a/Documentation/atomic_t.txt
+++ b/Documentation/atomic_t.txt
@@ -171,14 +171,14 @@
  - RMW operations that are conditional are unordered on FAILURE,
    otherwise the above rules apply.
 
-Except of course when an operation has an explicit ordering like:
+Except of course when a successful operation has an explicit ordering like:
 
  {}_relaxed: unordered
  {}_acquire: the R of the RMW (or atomic_read) is an ACQUIRE
  {}_release: the W of the RMW (or atomic_set)  is a  RELEASE
 
 Where 'unordered' is against other memory locations. Address dependencies are
-not defeated.
+not defeated.  Conditional operations are still unordered on FAILURE.
 
 Fully ordered primitives are ordered against everything prior and everything
 subsequent. Therefore a fully ordered primitive is like having an smp_mb()
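
As the new litmus tests below demonstrate, the failure-path caveat has a
practical idiom: a failing cmpxchg() provides no ordering by itself, but
following it with smp_mb__after_atomic() restores full ordering.  A minimal
sketch (z and other are hypothetical variables):

	r1 = cmpxchg(&z, 1, 0);		/* Unordered if the compare fails... */
	smp_mb__after_atomic();		/* ...so force full ordering anyway. */
	r0 = READ_ONCE(other);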
diff --git a/Documentation/dev-tools/kcsan.rst b/Documentation/dev-tools/kcsan.rst
index 94b6802..02143f0 100644
--- a/Documentation/dev-tools/kcsan.rst
+++ b/Documentation/dev-tools/kcsan.rst
@@ -91,6 +91,16 @@
   behaviour when encountering a data race is deemed safe.  Please see
   `"Marking Shared-Memory Accesses" in the LKMM`_ for more information.
 
+* Similar to ``data_race(...)``, the type qualifier ``__data_racy`` can be used
+  to document that all data races due to accesses to a variable are intended
+  and should be ignored by KCSAN::
+
+    struct foo {
+        ...
+        int __data_racy stats_counter;
+        ...
+    };
+
 * Disabling data race detection for entire functions can be accomplished by
   using the function attribute ``__no_kcsan``::
 
diff --git a/Documentation/litmus-tests/README b/Documentation/litmus-tests/README
index 658d378..6c666f3 100644
--- a/Documentation/litmus-tests/README
+++ b/Documentation/litmus-tests/README
@@ -21,6 +21,51 @@
     Test that atomic_set() cannot break the atomicity of atomic RMWs.
     NOTE: Require herd7 7.56 or later which supports "(void)expr".
 
+cmpxchg-fail-ordered-1.litmus
+    Demonstrate that a failing cmpxchg() operation acts as a full barrier
+    when followed by smp_mb__after_atomic().
+
+cmpxchg-fail-ordered-2.litmus
+    Demonstrate that a failing cmpxchg() operation acts as an acquire
+    operation when followed by smp_mb__after_atomic().
+
+cmpxchg-fail-unordered-1.litmus
+    Demonstrate that a failing cmpxchg() operation does not act as a
+    full barrier.
+
+cmpxchg-fail-unordered-2.litmus
+    Demonstrate that a failing cmpxchg() operation does not act as an
+    acquire operation.
+
+
+locking (/locking directory)
+----------------------------
+
+DCL-broken.litmus
+    Demonstrates that double-checked locking needs more than just
+    the obvious lock acquisitions and releases.
+
+DCL-fixed.litmus
+    Demonstrates corrected double-checked locking that uses
+    smp_store_release() and smp_load_acquire() in addition to the
+    obvious lock acquisitions and releases.
+
+RM-broken.litmus
+    Demonstrates problems with "roach motel" locking, where code is
+    freely moved into lock-based critical sections.  This example also
+    shows how to use the "filter" clause to discard executions that
+    would be excluded by other code not modeled in the litmus test.
+    Note also that this "roach motel" optimization is emulated by
+    physically moving P1()'s two reads from x under the lock.
+
+    What is a roach motel?  This is from an old advertisement for
+    a cockroach trap, much later featured in one of the "Men in
+    Black" movies.  "The roaches check in.  They don't check out."
+
+RM-fixed.litmus
+    The counterpart to RM-broken.litmus, showing P1()'s two loads from
+    x safely outside of the critical section.
+
 
 RCU (/rcu directory)
 --------------------
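
For reference, the pattern the DCL tests model looks as follows in kernel C
(a sketch under assumed names: gobj, glock, and alloc_and_init_foo() are all
hypothetical).  The smp_load_acquire()/smp_store_release() pair is exactly
what DCL-broken omits and DCL-fixed adds:

	static struct foo *gobj;
	static DEFINE_SPINLOCK(glock);

	struct foo *get_obj(void)
	{
		struct foo *p = smp_load_acquire(&gobj);	/* fast path */

		if (p)
			return p;
		spin_lock(&glock);
		p = gobj;			/* re-check under the lock */
		if (!p) {
			p = alloc_and_init_foo();
			smp_store_release(&gobj, p);	/* publish */
		}
		spin_unlock(&glock);
		return p;
	}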
diff --git a/Documentation/litmus-tests/atomic/cmpxchg-fail-ordered-1.litmus b/Documentation/litmus-tests/atomic/cmpxchg-fail-ordered-1.litmus
new file mode 100644
index 0000000..c0f93dc
--- /dev/null
+++ b/Documentation/litmus-tests/atomic/cmpxchg-fail-ordered-1.litmus
@@ -0,0 +1,35 @@
+C cmpxchg-fail-ordered-1
+
+(*
+ * Result: Never
+ *
+ * Demonstrate that a failing cmpxchg() operation will act as a full
+ * barrier when followed by smp_mb__after_atomic().
+ *)
+
+{}
+
+P0(int *x, int *y, int *z)
+{
+	int r0;
+	int r1;
+
+	WRITE_ONCE(*x, 1);
+	r1 = cmpxchg(z, 1, 0);
+	smp_mb__after_atomic();
+	r0 = READ_ONCE(*y);
+}
+
+P1(int *x, int *y, int *z)
+{
+	int r0;
+	int r1;
+
+	WRITE_ONCE(*y, 1);
+	r1 = cmpxchg(z, 1, 0);
+	smp_mb__after_atomic();
+	r0 = READ_ONCE(*x);
+}
+
+locations [0:r1;1:r1]
+exists (0:r0=0 /\ 1:r0=0)
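
This is the classic store-buffering shape, with the failing cmpxchg() plus
smp_mb__after_atomic() standing in for the textbook smp_mb().  For
comparison, the smp_mb() form that forbids the same outcome:

	/* P0 */				/* P1 */
	WRITE_ONCE(*x, 1);			WRITE_ONCE(*y, 1);
	smp_mb();				smp_mb();
	r0 = READ_ONCE(*y);			r0 = READ_ONCE(*x);

	/* Forbidden: 0:r0 == 0 && 1:r0 == 0. */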
diff --git a/Documentation/litmus-tests/atomic/cmpxchg-fail-ordered-2.litmus b/Documentation/litmus-tests/atomic/cmpxchg-fail-ordered-2.litmus
new file mode 100644
index 0000000..5c06054
--- /dev/null
+++ b/Documentation/litmus-tests/atomic/cmpxchg-fail-ordered-2.litmus
@@ -0,0 +1,30 @@
+C cmpxchg-fail-ordered-2
+
+(*
+ * Result: Never
+ *
+ * Demonstrate use of smp_mb__after_atomic() to make a failing cmpxchg()
+ * operation have acquire ordering.
+ *)
+
+{}
+
+P0(int *x, int *y)
+{
+	int r1;
+
+	WRITE_ONCE(*x, 1);
+	r1 = cmpxchg(y, 0, 1);
+}
+
+P1(int *x, int *y)
+{
+	int r1;
+	int r2;
+
+	r1 = cmpxchg(y, 0, 1);
+	smp_mb__after_atomic();
+	r2 = READ_ONCE(*x);
+}
+
+exists (0:r1=0 /\ 1:r1=1 /\ 1:r2=0)
diff --git a/Documentation/litmus-tests/atomic/cmpxchg-fail-unordered-1.litmus b/Documentation/litmus-tests/atomic/cmpxchg-fail-unordered-1.litmus
new file mode 100644
index 0000000..39ea1f5
--- /dev/null
+++ b/Documentation/litmus-tests/atomic/cmpxchg-fail-unordered-1.litmus
@@ -0,0 +1,34 @@
+C cmpxchg-fail-unordered-1
+
+(*
+ * Result: Sometimes
+ *
+ * Demonstrate that a failing cmpxchg() operation does not act as a
+ * full barrier.  (In contrast, a successful cmpxchg() does act as a
+ * full barrier.)
+ *)
+
+{}
+
+P0(int *x, int *y, int *z)
+{
+	int r0;
+	int r1;
+
+	WRITE_ONCE(*x, 1);
+	r1 = cmpxchg(z, 1, 0);
+	r0 = READ_ONCE(*y);
+}
+
+P1(int *x, int *y, int *z)
+{
+	int r0;
+	int r1;
+
+	WRITE_ONCE(*y, 1);
+	r1 = cmpxchg(z, 1, 0);
+	r0 = READ_ONCE(*x);
+}
+
+locations [0:r1;1:r1]
+exists (0:r0=0 /\ 1:r0=0)
diff --git a/Documentation/litmus-tests/atomic/cmpxchg-fail-unordered-2.litmus b/Documentation/litmus-tests/atomic/cmpxchg-fail-unordered-2.litmus
new file mode 100644
index 0000000..61aab24
--- /dev/null
+++ b/Documentation/litmus-tests/atomic/cmpxchg-fail-unordered-2.litmus
@@ -0,0 +1,30 @@
+C cmpxchg-fail-unordered-2
+
+(*
+ * Result: Sometimes
+ *
+ * Demonstrate that a failing cmpxchg() operation does not act as either
+ * an acquire or a release operation.  (In contrast, a successful cmpxchg()
+ * does act as both an acquire and a release operation.)
+ *)
+
+{}
+
+P0(int *x, int *y)
+{
+	int r1;
+
+	WRITE_ONCE(*x, 1);
+	r1 = cmpxchg(y, 0, 1);
+}
+
+P1(int *x, int *y)
+{
+	int r1;
+	int r2;
+
+	r1 = cmpxchg(y, 0, 1);
+	r2 = READ_ONCE(*x);
+}
+
+exists (0:r1=0 /\ 1:r1=1 /\ 1:r2=0)
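
The corresponding anti-pattern in kernel C is relying on a failed cmpxchg()
to order a subsequent read (sketch; flag and data are hypothetical).  Note
that the control dependency does not help: control dependencies order only
later stores, not later loads:

	/* BROKEN: a failed cmpxchg() provides no acquire ordering. */
	if (cmpxchg(&flag, 0, 1) == 1) {
		/* Add smp_mb__after_atomic() here if ordering is needed. */
		r = READ_ONCE(data);	/* may be satisfied early */
	}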
diff --git a/arch/Kconfig b/arch/Kconfig
index ae4a4f3..93d5010 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1609,4 +1609,7 @@
 	# strict alignment always, even with -falign-functions.
 	def_bool CC_HAS_MIN_FUNCTION_ALIGNMENT || CC_IS_CLANG
 
+config ARCH_NEED_CMPXCHG_1_EMU
+	bool
+
 endmenu
diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index d3ac367..5479707 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -37,6 +37,7 @@
 	select ARCH_INLINE_SPIN_UNLOCK_BH if !PREEMPTION
 	select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPTION
 	select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPTION
+	select ARCH_NEED_CMPXCHG_1_EMU
 	select ARCH_WANT_FRAME_POINTERS if !CPU_CK610 && $(cc-option,-mbacktrace)
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
 	select COMMON_CLK
diff --git a/arch/csky/include/asm/cmpxchg.h b/arch/csky/include/asm/cmpxchg.h
index 916043b..db6dda4 100644
--- a/arch/csky/include/asm/cmpxchg.h
+++ b/arch/csky/include/asm/cmpxchg.h
@@ -6,6 +6,7 @@
 #ifdef CONFIG_SMP
 #include <linux/bug.h>
 #include <asm/barrier.h>
+#include <linux/cmpxchg-emu.h>
 
 #define __xchg_relaxed(new, ptr, size)				\
 ({								\
@@ -61,6 +62,9 @@
 	__typeof__(old) __old = (old);				\
 	__typeof__(*(ptr)) __ret;				\
 	switch (size) {						\
+	case 1:							\
+		__ret = (__typeof__(*(ptr)))cmpxchg_emu_u8((volatile u8 *)__ptr, (uintptr_t)__old, (uintptr_t)__new); \
+		break;						\
 	case 4:							\
 		asm volatile (					\
 		"1:	ldex.w		%0, (%3) \n"		\
@@ -91,6 +95,9 @@
 	__typeof__(old) __old = (old);				\
 	__typeof__(*(ptr)) __ret;				\
 	switch (size) {						\
+	case 1:							\
+		__ret = (__typeof__(*(ptr)))cmpxchg_emu_u8((volatile u8 *)__ptr, (uintptr_t)__old, (uintptr_t)__new); \
+		break;						\
 	case 4:							\
 		asm volatile (					\
 		"1:	ldex.w		%0, (%3) \n"		\
@@ -122,6 +129,9 @@
 	__typeof__(old) __old = (old);				\
 	__typeof__(*(ptr)) __ret;				\
 	switch (size) {						\
+	case 1:							\
+		__ret = (__typeof__(*(ptr)))cmpxchg_emu_u8((volatile u8 *)__ptr, (uintptr_t)__old, (uintptr_t)__new); \
+		break;						\
 	case 4:							\
 		asm volatile (					\
 		RELEASE_FENCE					\
diff --git a/arch/parisc/include/asm/cmpxchg.h b/arch/parisc/include/asm/cmpxchg.h
index c1d776b..bf0a0f1 100644
--- a/arch/parisc/include/asm/cmpxchg.h
+++ b/arch/parisc/include/asm/cmpxchg.h
@@ -56,26 +56,24 @@
 /* bug catcher for when unsupported size is used - won't link */
 extern void __cmpxchg_called_with_bad_pointer(void);
 
-/* __cmpxchg_u32/u64 defined in arch/parisc/lib/bitops.c */
-extern unsigned long __cmpxchg_u32(volatile unsigned int *m, unsigned int old,
-				   unsigned int new_);
-extern u64 __cmpxchg_u64(volatile u64 *ptr, u64 old, u64 new_);
+/* __cmpxchg_u... defined in arch/parisc/lib/bitops.c */
 extern u8 __cmpxchg_u8(volatile u8 *ptr, u8 old, u8 new_);
+extern u16 __cmpxchg_u16(volatile u16 *ptr, u16 old, u16 new_);
+extern u32 __cmpxchg_u32(volatile u32 *m, u32 old, u32 new_);
+extern u64 __cmpxchg_u64(volatile u64 *ptr, u64 old, u64 new_);
 
 /* don't worry...optimizer will get rid of most of this */
 static inline unsigned long
 __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new_, int size)
 {
-	switch (size) {
+	return
 #ifdef CONFIG_64BIT
-	case 8: return __cmpxchg_u64((u64 *)ptr, old, new_);
+		size == 8 ? __cmpxchg_u64(ptr, old, new_) :
 #endif
-	case 4: return __cmpxchg_u32((unsigned int *)ptr,
-				     (unsigned int)old, (unsigned int)new_);
-	case 1: return __cmpxchg_u8((u8 *)ptr, old & 0xff, new_ & 0xff);
-	}
-	__cmpxchg_called_with_bad_pointer();
-	return old;
+		size == 4 ? __cmpxchg_u32(ptr, old, new_) :
+		size == 2 ? __cmpxchg_u16(ptr, old, new_) :
+		size == 1 ? __cmpxchg_u8(ptr, old, new_) :
+			(__cmpxchg_called_with_bad_pointer(), old);
 }
 
 #define arch_cmpxchg(ptr, o, n)						 \
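
The "won't link" bug catcher works because the size argument is a
compile-time constant: every supported size folds to a call to a real
function, while an unsupported size leaves behind a reference to the
deliberately undefined __cmpxchg_called_with_bad_pointer(), turning a type
error into a link-time failure.  A freestanding sketch of the technique
(all names hypothetical):

	extern void __bad_cmpxchg_size(void);	/* never defined anywhere */
	extern unsigned long do_u32(volatile void *p, unsigned long o, unsigned long n);
	extern unsigned long do_u8(volatile void *p, unsigned long o, unsigned long n);

	static inline unsigned long
	dispatch(volatile void *ptr, unsigned long old, unsigned long new_, int size)
	{
		return	size == 4 ? do_u32(ptr, old, new_) :
			size == 1 ? do_u8(ptr, old, new_) :
				(__bad_cmpxchg_size(), old);
	}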
diff --git a/arch/parisc/kernel/parisc_ksyms.c b/arch/parisc/kernel/parisc_ksyms.c
index 6f0c92e..c1587aa 100644
--- a/arch/parisc/kernel/parisc_ksyms.c
+++ b/arch/parisc/kernel/parisc_ksyms.c
@@ -22,6 +22,8 @@
 #include <linux/atomic.h>
 EXPORT_SYMBOL(__xchg8);
 EXPORT_SYMBOL(__xchg32);
+EXPORT_SYMBOL(__cmpxchg_u8);
+EXPORT_SYMBOL(__cmpxchg_u16);
 EXPORT_SYMBOL(__cmpxchg_u32);
 EXPORT_SYMBOL(__cmpxchg_u64);
 #ifdef CONFIG_SMP
diff --git a/arch/parisc/lib/bitops.c b/arch/parisc/lib/bitops.c
index 36a3141..9df8100 100644
--- a/arch/parisc/lib/bitops.c
+++ b/arch/parisc/lib/bitops.c
@@ -56,38 +56,20 @@
 }
 
 
-u64 notrace __cmpxchg_u64(volatile u64 *ptr, u64 old, u64 new)
-{
-	unsigned long flags;
-	u64 prev;
+#define CMPXCHG(T)						\
+	T notrace __cmpxchg_##T(volatile T *ptr, T old, T new)	\
+	{							\
+		unsigned long flags;				\
+		T prev;						\
+								\
+		_atomic_spin_lock_irqsave(ptr, flags);		\
+		if ((prev = *ptr) == old)			\
+			*ptr = new;				\
+		_atomic_spin_unlock_irqrestore(ptr, flags);	\
+		return prev;					\
+	}
 
-	_atomic_spin_lock_irqsave(ptr, flags);
-	if ((prev = *ptr) == old)
-		*ptr = new;
-	_atomic_spin_unlock_irqrestore(ptr, flags);
-	return prev;
-}
-
-unsigned long notrace __cmpxchg_u32(volatile unsigned int *ptr, unsigned int old, unsigned int new)
-{
-	unsigned long flags;
-	unsigned int prev;
-
-	_atomic_spin_lock_irqsave(ptr, flags);
-	if ((prev = *ptr) == old)
-		*ptr = new;
-	_atomic_spin_unlock_irqrestore(ptr, flags);
-	return (unsigned long)prev;
-}
-
-u8 notrace __cmpxchg_u8(volatile u8 *ptr, u8 old, u8 new)
-{
-	unsigned long flags;
-	u8 prev;
-
-	_atomic_spin_lock_irqsave(ptr, flags);
-	if ((prev = *ptr) == old)
-		*ptr = new;
-	_atomic_spin_unlock_irqrestore(ptr, flags);
-	return prev;
-}
+CMPXCHG(u64)
+CMPXCHG(u32)
+CMPXCHG(u16)
+CMPXCHG(u8)
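
For reference, CMPXCHG(u8) expands to the same function that the removed
open-coded version provided (expansion reconstructed modulo whitespace):

	u8 notrace __cmpxchg_u8(volatile u8 *ptr, u8 old, u8 new)
	{
		unsigned long flags;
		u8 prev;

		_atomic_spin_lock_irqsave(ptr, flags);
		if ((prev = *ptr) == old)
			*ptr = new;
		_atomic_spin_unlock_irqrestore(ptr, flags);
		return prev;
	}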
diff --git a/arch/sparc/include/asm/cmpxchg_32.h b/arch/sparc/include/asm/cmpxchg_32.h
index d0af82c..8c1a3ca 100644
--- a/arch/sparc/include/asm/cmpxchg_32.h
+++ b/arch/sparc/include/asm/cmpxchg_32.h
@@ -38,21 +38,19 @@
 
 /* bug catcher for when unsupported size is used - won't link */
 void __cmpxchg_called_with_bad_pointer(void);
-/* we only need to support cmpxchg of a u32 on sparc */
-unsigned long __cmpxchg_u32(volatile u32 *m, u32 old, u32 new_);
+u8 __cmpxchg_u8(volatile u8 *m, u8 old, u8 new_);
+u16 __cmpxchg_u16(volatile u16 *m, u16 old, u16 new_);
+u32 __cmpxchg_u32(volatile u32 *m, u32 old, u32 new_);
 
 /* don't worry...optimizer will get rid of most of this */
 static inline unsigned long
 __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new_, int size)
 {
-	switch (size) {
-	case 4:
-		return __cmpxchg_u32((u32 *)ptr, (u32)old, (u32)new_);
-	default:
-		__cmpxchg_called_with_bad_pointer();
-		break;
-	}
-	return old;
+	return
+		size == 1 ? __cmpxchg_u8(ptr, old, new_) :
+		size == 2 ? __cmpxchg_u16(ptr, old, new_) :
+		size == 4 ? __cmpxchg_u32(ptr, old, new_) :
+			(__cmpxchg_called_with_bad_pointer(), old);
 }
 
 #define arch_cmpxchg(ptr, o, n)						\
@@ -63,7 +61,7 @@
 			(unsigned long)_n_, sizeof(*(ptr)));		\
 })
 
-u64 __cmpxchg_u64(u64 *ptr, u64 old, u64 new);
+u64 __cmpxchg_u64(volatile u64 *ptr, u64 old, u64 new);
 #define arch_cmpxchg64(ptr, old, new)	__cmpxchg_u64(ptr, old, new)
 
 #include <asm-generic/cmpxchg-local.h>
diff --git a/arch/sparc/lib/atomic32.c b/arch/sparc/lib/atomic32.c
index cf80d1a..8ae880e 100644
--- a/arch/sparc/lib/atomic32.c
+++ b/arch/sparc/lib/atomic32.c
@@ -159,32 +159,27 @@
 }
 EXPORT_SYMBOL(sp32___change_bit);
 
-unsigned long __cmpxchg_u32(volatile u32 *ptr, u32 old, u32 new)
-{
-	unsigned long flags;
-	u32 prev;
+#define CMPXCHG(T)						\
+	T __cmpxchg_##T(volatile T *ptr, T old, T new)		\
+	{							\
+		unsigned long flags;				\
+		T prev;						\
+								\
+		spin_lock_irqsave(ATOMIC_HASH(ptr), flags);	\
+		if ((prev = *ptr) == old)			\
+			*ptr = new;				\
+		spin_unlock_irqrestore(ATOMIC_HASH(ptr), flags);\
+								\
+		return prev;					\
+	}
 
-	spin_lock_irqsave(ATOMIC_HASH(ptr), flags);
-	if ((prev = *ptr) == old)
-		*ptr = new;
-	spin_unlock_irqrestore(ATOMIC_HASH(ptr), flags);
-
-	return (unsigned long)prev;
-}
+CMPXCHG(u8)
+CMPXCHG(u16)
+CMPXCHG(u32)
+CMPXCHG(u64)
+EXPORT_SYMBOL(__cmpxchg_u8);
+EXPORT_SYMBOL(__cmpxchg_u16);
 EXPORT_SYMBOL(__cmpxchg_u32);
-
-u64 __cmpxchg_u64(u64 *ptr, u64 old, u64 new)
-{
-	unsigned long flags;
-	u64 prev;
-
-	spin_lock_irqsave(ATOMIC_HASH(ptr), flags);
-	if ((prev = *ptr) == old)
-		*ptr = new;
-	spin_unlock_irqrestore(ATOMIC_HASH(ptr), flags);
-
-	return prev;
-}
 EXPORT_SYMBOL(__cmpxchg_u64);
 
 unsigned long __xchg_u32(volatile u32 *ptr, u32 new)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 5a69a49..0e7f44c 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1289,7 +1289,7 @@
 	 */
 	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
 		/* assume multi socket systems are not synchronized: */
-		if (num_possible_cpus() > 1)
+		if (topology_max_packages() > 1)
 			return 1;
 	}
 
diff --git a/include/linux/cmpxchg-emu.h b/include/linux/cmpxchg-emu.h
new file mode 100644
index 0000000..998deec
--- /dev/null
+++ b/include/linux/cmpxchg-emu.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Emulated 1-byte cmpxchg operation for architectures lacking direct
+ * support for this size.  This is implemented in terms of 4-byte
+ * cmpxchg operations.
+ *
+ * Copyright (C) 2024 Paul E. McKenney.
+ */
+
+#ifndef __LINUX_CMPXCHG_EMU_H
+#define __LINUX_CMPXCHG_EMU_H
+
+uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new);
+
+#endif /* __LINUX_CMPXCHG_EMU_H */
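
An architecture selecting ARCH_NEED_CMPXCHG_1_EMU routes the one-byte case
of its __cmpxchg() size switch to this function, as the csky hunks above do:

	switch (size) {
	case 1:
		__ret = (__typeof__(*(ptr)))cmpxchg_emu_u8((volatile u8 *)__ptr,
							   (uintptr_t)__old,
							   (uintptr_t)__new);
		break;
	...
	}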
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 2abaa3a..a38162a 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -273,9 +273,16 @@
  * disable all instrumentation. See Kconfig.kcsan where this is mandatory.
  */
 # define __no_kcsan __no_sanitize_thread __disable_sanitizer_instrumentation
+/*
+ * Type qualifier to mark variables where all data-racy accesses should be
+ * ignored by KCSAN. Note, the implementation simply marks these variables as
+ * volatile, since KCSAN will treat such accesses as "marked".
+ */
+# define __data_racy volatile
 # define __no_sanitize_or_inline __no_kcsan notrace __maybe_unused
 #else
 # define __no_kcsan
+# define __data_racy
 #endif
 
 #ifndef __no_sanitize_or_inline
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index 0155862..0c17b4c 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -304,6 +304,7 @@
 static struct {
 	long val[8];
 } test_struct;
+static long __data_racy test_data_racy;
 static DEFINE_SEQLOCK(test_seqlock);
 static DEFINE_SPINLOCK(test_spinlock);
 static DEFINE_MUTEX(test_mutex);
@@ -358,6 +359,8 @@
 
 static noinline void test_kernel_data_race(void) { data_race(test_var++); }
 
+static noinline void test_kernel_data_racy_qualifier(void) { test_data_racy++; }
+
 static noinline void test_kernel_assert_writer(void)
 {
 	ASSERT_EXCLUSIVE_WRITER(test_var);
@@ -1009,6 +1012,19 @@
 	KUNIT_EXPECT_FALSE(test, match_never);
 }
 
+/* Test the __data_racy type qualifier. */
+__no_kcsan
+static void test_data_racy_qualifier(struct kunit *test)
+{
+	bool match_never = false;
+
+	begin_test_checks(test_kernel_data_racy_qualifier, test_kernel_data_racy_qualifier);
+	do {
+		match_never = report_available();
+	} while (!end_test_checks(match_never));
+	KUNIT_EXPECT_FALSE(test, match_never);
+}
+
 __no_kcsan
 static void test_assert_exclusive_writer(struct kunit *test)
 {
@@ -1424,6 +1440,7 @@
 	KCSAN_KUNIT_CASE(test_read_plain_atomic_rmw),
 	KCSAN_KUNIT_CASE(test_zero_size_access),
 	KCSAN_KUNIT_CASE(test_data_race),
+	KCSAN_KUNIT_CASE(test_data_racy_qualifier),
 	KCSAN_KUNIT_CASE(test_assert_exclusive_writer),
 	KCSAN_KUNIT_CASE(test_assert_exclusive_access),
 	KCSAN_KUNIT_CASE(test_assert_exclusive_access_writer),
diff --git a/lib/Makefile b/lib/Makefile
index ffc6b23..cc3d52f 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -236,6 +236,7 @@
 lib-$(CONFIG_GENERIC_BUG) += bug.o
 
 obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o
+obj-$(CONFIG_ARCH_NEED_CMPXCHG_1_EMU) += cmpxchg-emu.o
 
 obj-$(CONFIG_DYNAMIC_DEBUG_CORE) += dynamic_debug.o
 #ensure exported functions have prototypes
diff --git a/lib/cmpxchg-emu.c b/lib/cmpxchg-emu.c
new file mode 100644
index 0000000..27f6f97
--- /dev/null
+++ b/lib/cmpxchg-emu.c
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Emulated 1-byte cmpxchg operation for architectures lacking direct
+ * support for this size.  This is implemented in terms of 4-byte cmpxchg
+ * operations.
+ *
+ * Copyright (C) 2024 Paul E. McKenney.
+ */
+
+#include <linux/types.h>
+#include <linux/export.h>
+#include <linux/instrumented.h>
+#include <linux/atomic.h>
+#include <linux/panic.h>
+#include <linux/bug.h>
+#include <asm-generic/rwonce.h>
+#include <linux/cmpxchg-emu.h>
+
+union u8_32 {
+	u8 b[4];
+	u32 w;
+};
+
+/* Emulate one-byte cmpxchg() in terms of 4-byte cmpxchg. */
+uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new)
+{
+	u32 *p32 = (u32 *)(((uintptr_t)p) & ~0x3);
+	int i = ((uintptr_t)p) & 0x3;
+	union u8_32 old32;
+	union u8_32 new32;
+	u32 ret;
+
+	ret = READ_ONCE(*p32);
+	do {
+		old32.w = ret;
+		if (old32.b[i] != old)
+			return old32.b[i];
+		new32.w = old32.w;
+		new32.b[i] = new;
+		instrument_atomic_read_write(p, 1);
+		ret = data_race(cmpxchg(p32, old32.w, new32.w)); // KCSAN reporting overridden by instrument_atomic_read_write() above.
+	} while (ret != old32.w);
+	return old;
+}
+EXPORT_SYMBOL_GPL(cmpxchg_emu_u8);
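
The byte-within-word indexing in cmpxchg_emu_u8() is endian-agnostic
because the union's byte array indexes memory order: b[i] always aliases
the byte at address p32 + i, which is p itself.  The algorithm can be
exercised stand-alone in user space with compiler builtins (a sketch, not
the kernel code; __atomic_compare_exchange_n() stands in for cmpxchg()):

	#include <stdint.h>
	#include <stdio.h>

	union u8_32 {
		uint8_t b[4];
		uint32_t w;
	};

	/* User-space re-implementation of the emulation algorithm. */
	static uint8_t emu_cmpxchg_u8(uint8_t *p, uint8_t old, uint8_t new_)
	{
		uint32_t *p32 = (uint32_t *)((uintptr_t)p & ~(uintptr_t)0x3);
		int i = (uintptr_t)p & 0x3;	/* b[i] aliases *p on any endianness */
		union u8_32 old32, new32;
		uint32_t ret = *p32;

		do {
			old32.w = ret;
			if (old32.b[i] != old)
				return old32.b[i];	/* compare failed */
			new32.w = old32.w;
			new32.b[i] = new_;
			/* On failure, ret is refreshed with the current value. */
		} while (!__atomic_compare_exchange_n(p32, &ret, new32.w, 0,
						      __ATOMIC_SEQ_CST,
						      __ATOMIC_SEQ_CST));
		return old;
	}

	int main(void)
	{
		static uint32_t word = 0x04030201;
		uint8_t *bytes = (uint8_t *)&word;

		/* Swap one byte mid-word, leaving its neighbors intact. */
		printf("ret=0x%02x word=0x%08x\n",
		       (unsigned)emu_cmpxchg_u8(&bytes[2], bytes[2], 0xaa),
		       (unsigned)word);
		return 0;
	}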