rcu: Narrow early boot window of illegal synchronous grace periods

The current preemptible RCU implementation goes through three phases
during bootup.  In the first phase, there is only one CPU that is
running with preemption disabled, so a synchronous grace period can
be a no-op.  In the second phase, the scheduler is running, but RCU
has not yet gotten its kthreads spawned (and, for expedited grace
periods, workqueues are not yet running).  During this time, any
attempt to do a synchronous grace period will hang the system (or
complain bitterly, depending).  In the third and final phase, RCU
is fully operational and everything works normally.
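
For reference, phase one gets away with doing nothing because there is
nothing for a synchronous grace period to wait on: with a single CPU
running with preemption disabled, no RCU reader can be in flight when
the grace-period primitive is called.  Roughly, paraphrasing the
mainline code of this era (details illustrative rather than exact):

	void synchronize_rcu(void)
	{
		/* ... lockdep checks elided ... */
		if (!rcu_scheduler_active)
			return;  /* Phase one: a no-op suffices. */
		/* ... otherwise wait for a real grace period ... */
	}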

This has been OK for some time, but synchronous grace periods have
recently started showing up during the second, dead-zone phase.  This
commit therefore narrows that second phase.

As soon as kthreads can be spawned, expedited grace period kthreads are
created, and normal grace periods are unconditionally mapped to expedited
grace periods.  These kthreads are stopped once RCU is fully
operational.  Expedited grace periods then go back to using workqueues,
and, if other settings permit, normal grace periods are enabled.
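
Concretely, while the boot-time kthreads are active, rcu_gp_is_expedited()
(see the update.c hunk below) returns true, so even a plain call to
synchronize_rcu() is steered onto the expedited path, which hands the
grace period to the boot-time kthread rather than to a not-yet-running
workqueue.  A simplified sketch of the caller-side mapping, again
paraphrasing the code of this era:

	void synchronize_rcu(void)
	{
		/* ... early-boot and lockdep checks elided ... */
		if (rcu_gp_is_expedited())  /* True while rcu_expedited_till_core is set. */
			synchronize_rcu_expedited();  /* Mid-boot: wakes the boot-time kthread. */
		else
			wait_rcu_gp(call_rcu);  /* Normal path once RCU is fully up. */
	}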

Reported-by: "Zheng, Lv" <lv.zheng@intel.com>
Reported-by: Borislav Petkov <bp@alien8.de>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index e3e6397..6454b8c 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -230,6 +230,10 @@ static inline void rcu_irq_exit(void)
 {
 }
 
+static inline void rcu_create_expedited_kthreads(void)
+{
+}
+
 static inline void exit_rcu(void)
 {
 }
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index fcd61cb..5e76147 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -104,6 +104,7 @@ void rcu_irq_exit(void);
 void rcu_irq_enter_irqson(void);
 void rcu_irq_exit_irqson(void);
 
+void rcu_create_expedited_kthreads(void);
 void exit_rcu(void);
 
 void rcu_scheduler_starting(void);
diff --git a/init/main.c b/init/main.c
index b0c9d6f..1ee9f7d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -385,7 +385,6 @@ static noinline void __ref rest_init(void)
 {
 	int pid;
 
-	rcu_scheduler_starting();
 	/*
 	 * We need to spawn init first so that it obtains pid 1, however
 	 * the init task will end up wanting to create kthreads, which, if
@@ -393,6 +392,7 @@ static noinline void __ref rest_init(void)
 	 */
 	kernel_thread(kernel_init, NULL, CLONE_FS);
 	numa_default_policy();
+	rcu_scheduler_starting();
 	pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
 	rcu_read_lock();
 	kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
@@ -1011,6 +1011,10 @@ static noinline void __init kernel_init_freeable(void)
 	 */
 	set_mems_allowed(node_states[N_MEMORY]);
 	/*
+	 * Get RCU synchronous grace periods back in action.
+	 */
+	rcu_create_expedited_kthreads();
+	/*
 	 * init can run on any cpu.
 	 */
 	set_cpus_allowed_ptr(current, cpu_all_mask);
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 80adef7..0d6ff3e 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -136,6 +136,7 @@ int rcu_jiffies_till_stall_check(void);
 #define TPS(x)  tracepoint_string(x)
 
 void rcu_early_boot_tests(void);
+void rcu_test_sync_prims(void);
 
 /*
  * This function really isn't for public consumption, but RCU is special in
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 5a2ad58..728f138 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -524,6 +524,11 @@ struct rcu_state {
 	atomic_t expedited_need_qs;		/* # CPUs left to check in. */
 	struct swait_queue_head expedited_wq;	/* Wait for check-ins. */
 	int ncpus_snap;				/* # CPUs seen last time. */
+	smp_call_func_t exp_ipi_handler;	/* IPI handler. */
+	struct swait_queue_head exp_boot_wq;	/* Boot-time GP wakeup. */
+	struct task_struct *exp_boot_kthread;	/* Boot-time GP kthread. */
+	unsigned long exp_boot_s;		/* Boot-time sequence number. */
+	bool exp_boot_go;			/* Boot-time "go" flag. */
 
 	unsigned long jiffies_force_qs;		/* Time at which to invoke */
 						/*  force_quiescent_state(). */
@@ -594,6 +599,8 @@ extern struct rcu_state rcu_bh_state;
 extern struct rcu_state rcu_preempt_state;
 #endif /* #ifdef CONFIG_PREEMPT_RCU */
 
+extern bool rcu_expedited_till_core;
+
 int rcu_dynticks_snap(struct rcu_dynticks *rdtp);
 bool rcu_eqs_special_set(int cpu);
 
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index a377954..d30839c 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -546,18 +546,28 @@ struct rcu_exp_work {
 };
 
 /*
+ * Common code to drive an expedited grace period forward, used by
+ * workqueues and boot-time kthreads.
+ */
+static void rcu_exp_sel_wait_wake(struct rcu_state *rsp,
+				  smp_call_func_t func, unsigned long s)
+{
+	/* Initialize the rcu_node tree in preparation for the wait. */
+	sync_rcu_exp_select_cpus(rsp, func);
+
+	/* Wait and clean up, including waking everyone. */
+	rcu_exp_wait_wake(rsp, s);
+}
+
+/*
  * Work-queue handler to drive an expedited grace period forward.
  */
 static void wait_rcu_exp_gp(struct work_struct *wp)
 {
 	struct rcu_exp_work *rewp;
 
-	/* Initialize the rcu_node tree in preparation for the wait. */
 	rewp = container_of(wp, struct rcu_exp_work, rew_work);
-	sync_rcu_exp_select_cpus(rewp->rew_rsp, rewp->rew_func);
-
-	/* Wait and clean up, including waking everyone. */
-	rcu_exp_wait_wake(rewp->rew_rsp, rewp->rew_s);
+	rcu_exp_sel_wait_wake(rewp->rew_rsp, rewp->rew_func, rewp->rew_s);
 }
 
 /*
@@ -583,12 +593,20 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp,
 	if (exp_funnel_lock(rsp, s))
 		return;  /* Someone else did our work for us. */
 
-	/* Marshall arguments and schedule the expedited grace period. */
-	rew.rew_func = func;
-	rew.rew_rsp = rsp;
-	rew.rew_s = s;
-	INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
-	schedule_work(&rew.rew_work);
+	/* Ensure that load happens before action based on it. */
+	if (unlikely(smp_load_acquire(&rcu_expedited_till_core))) { /* ^^^ */
+		rsp->exp_boot_s = s;
+		rsp->exp_boot_go = true;
+		smp_mb(); /* Above assignments visible to awakened task. */
+		swake_up(&rsp->exp_boot_wq);
+	} else {
+		/* Marshall arguments & schedule the expedited grace period. */
+		rew.rew_func = func;
+		rew.rew_rsp = rsp;
+		rew.rew_s = s;
+		INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
+		schedule_work(&rew.rew_work);
+	}
 
 	/* Wait for expedited grace period to complete. */
 	rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
@@ -705,6 +723,14 @@ void synchronize_rcu_expedited(void)
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
 
+/*
+ * Set up the IPI handler for synchronize_rcu_expedited().
+ */
+static void rcu_preempt_create_expedited_kthreads(void)
+{
+	rcu_preempt_state.exp_ipi_handler = sync_rcu_exp_handler;
+}
+
 #else /* #ifdef CONFIG_PREEMPT_RCU */
 
 /*
@@ -717,4 +743,82 @@ void synchronize_rcu_expedited(void)
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
 
+static void rcu_preempt_create_expedited_kthreads(void)
+{
+}
+
 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+
+/*
+ * This kthread handles early boot expedited grace-period requests that
+ * happen after it is possible to spawn kthreads, but before the workqueues
+ * and kthreads required by the run-time RCU grace-period mechanism have
+ * been spawned and fully initialized.  There is one of these kthreads
+ * per flavor of RCU, and they are cleaned up before boot completes.
+ */
+static int rcu_exp_boot_kthread(void *arg)
+{
+	struct rcu_state *rsp = arg;
+
+	pr_info("RCU: Starting %s boot-time expedited kthread.\n", rsp->name);
+	for (;;) {
+		swait_event(rsp->exp_boot_wq, READ_ONCE(rsp->exp_boot_go));
+		smp_mb(); /* Pre-wakeup accesses must be visible below. */
+		if (kthread_should_stop())
+			break;
+		rsp->exp_boot_go = false;
+		rcu_exp_sel_wait_wake(rsp,
+				      rsp->exp_ipi_handler, rsp->exp_boot_s);
+	}
+	pr_info("RCU: Stopping %s boot-time expedited kthread.\n", rsp->name);
+	return 0;
+}
+
+/*
+ * Create boot-time kthreads for expedited grace periods.  During the
+ * time that these kthreads are active, normal grace periods are mapped
+ * to expedited grace periods.
+ */
+void rcu_create_expedited_kthreads(void)
+{
+	struct rcu_state *rsp;
+	struct task_struct *t;
+
+	rcu_sched_state.exp_ipi_handler = sync_sched_exp_handler;
+	rcu_preempt_create_expedited_kthreads();
+	for_each_rcu_flavor(rsp) {
+		init_swait_queue_head(&rsp->exp_boot_wq);
+		t = kthread_run(rcu_exp_boot_kthread, rsp, "rcue%c", rsp->abbr);
+		BUG_ON(IS_ERR(t));
+		rsp->exp_boot_kthread = t;
+	}
+	/* Make sure state is in place before announcing new mode. */
+	smp_store_release(&rcu_expedited_till_core, true); /* ^^^ */
+	barrier(); /* Store-release before tests. */
+	rcu_test_sync_prims();
+}
+
+/*
+ * Terminate the boot-time expedited kthreads once Tree RCU has
+ * fully initialized.
+ */
+static int __init rcu_done_expedited_kthreads(void)
+{
+	struct rcu_state *rsp;
+
+	rcu_test_sync_prims();
+	/* Ensure runtime RCU mechanism in place before announcing. */
+	smp_store_release(&rcu_expedited_till_core, false); /* ^^^ */
+	for_each_rcu_flavor(rsp) {
+		/* Exclude concurrent kthread/workqueue decisions. */
+		mutex_lock(&rsp->exp_mutex);
+		rsp->exp_boot_go = true;
+		smp_mb(); /* Set "go" before woken code executes. */
+		kthread_stop(rsp->exp_boot_kthread);
+		rsp->exp_boot_kthread = NULL;
+		mutex_unlock(&rsp->exp_mutex);
+	}
+	rcu_test_sync_prims();
+	return 0;
+}
+core_initcall(rcu_done_expedited_kthreads);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 90199a2..916b7a4 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -116,6 +116,7 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held);
 #endif
 
 #ifndef CONFIG_TINY_RCU
+bool rcu_expedited_till_core;
 
 /*
  * Should expedited grace-period primitives always fall back to their
@@ -125,7 +126,9 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held);
  */
 bool rcu_gp_is_normal(void)
 {
-	return READ_ONCE(rcu_normal);
+	/* Make sure later actions ordered with state change. */
+	return READ_ONCE(rcu_normal) &&
+	       !smp_load_acquire(&rcu_expedited_till_core); /* ^^^ */
 }
 EXPORT_SYMBOL_GPL(rcu_gp_is_normal);
 
@@ -140,7 +143,9 @@ static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1);
  */
 bool rcu_gp_is_expedited(void)
 {
-	return rcu_expedited || atomic_read(&rcu_expedited_nesting);
+	/* Make sure later actions ordered with state change. */
+	return rcu_expedited || atomic_read(&rcu_expedited_nesting) ||
+	       smp_load_acquire(&rcu_expedited_till_core); /* ^^^ */
 }
 EXPORT_SYMBOL_GPL(rcu_gp_is_expedited);
 
@@ -811,6 +816,23 @@ static void rcu_spawn_tasks_kthread(void)
 
 #endif /* #ifdef CONFIG_TASKS_RCU */
 
+/*
+ * Test each non-SRCU synchronous grace-period wait API.  This is
+ * useful just after a change in mode for these primitives, and
+ * during early boot.
+ */
+void rcu_test_sync_prims(void)
+{
+	if (!IS_ENABLED(CONFIG_PROVE_RCU))
+		return;
+	synchronize_rcu();
+	synchronize_rcu_bh();
+	synchronize_sched();
+	synchronize_rcu_expedited();
+	synchronize_rcu_bh_expedited();
+	synchronize_sched_expedited();
+}
+
 #ifdef CONFIG_PROVE_RCU
 
 /*
@@ -863,13 +885,7 @@ void rcu_early_boot_tests(void)
 		early_boot_test_call_rcu_bh();
 	if (rcu_self_test_sched)
 		early_boot_test_call_rcu_sched();
-
-	synchronize_rcu();
-	synchronize_rcu_bh();
-	synchronize_sched();
-	synchronize_rcu_expedited();
-	synchronize_rcu_bh_expedited();
-	synchronize_sched_expedited();
+	rcu_test_sync_prims();
 }
 
 static int rcu_verify_early_boot_tests(void)