Merge tag 's390-5.14-2' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux

Pull more s390 updates from Vasily Gorbik:

 - Fix preempt_count initialization.

 - Rework call_on_stack() macro to add proper type handling and avoid
   possible register corruption.

 - Remove more of the error-prone "register asm" constructs, plus
   related fixes.

 - Fix syscall restarting when multiple signals are coming in. This adds
   minimalistic trampolines to the vdso so we can return from a signal
   without using the stack, which would otherwise require pgm check
   handler hacks when NX is enabled.

 - Remove HAVE_IRQ_EXIT_ON_IRQ_STACK since this is no longer true after
   the switch to generic entry.

 - Fix protected virtualization secure storage access exception
   handling.

 - Make the machine check C handler always enter with DAT enabled, and
   move register validation to C code.

 - Fix tinyconfig boot problem by avoiding MONITOR CALL without
   CONFIG_BUG.

 - Increase asm symbol alignment to 16 to make it consistent with
   compilers.

 - Enable concurrent access to the CPU Measurement Counter Facility.

 - Add support for a dynamic AP bus size limit, and rework ap_dqap() to
   deal with messages larger than the receive buffer.

* tag 's390-5.14-2' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux: (41 commits)
  s390: preempt: Fix preempt_count initialization
  s390/linkage: increase asm symbols alignment to 16
  s390: rename CALL_ON_STACK_NORETURN() to call_on_stack_noreturn()
  s390: add type checking to CALL_ON_STACK_NORETURN() macro
  s390: remove old CALL_ON_STACK() macro
  s390/softirq: use call_on_stack() macro
  s390/lib: use call_on_stack() macro
  s390/smp: use call_on_stack() macro
  s390/kexec: use call_on_stack() macro
  s390/irq: use call_on_stack() macro
  s390/mm: use call_on_stack() macro
  s390: introduce proper type handling call_on_stack() macro
  s390/irq: simplify on_async_stack()
  s390/irq: inline do_softirq_own_stack()
  s390/irq: simplify do_softirq_own_stack()
  s390/ap: get rid of register asm in ap_dqap()
  s390: rename PIF_SYSCALL_RESTART to PIF_EXECVE_PGSTE_RESTART
  s390: move restart of execve() syscall
  s390/signal: remove sigreturn on stack
  s390/signal: switch to using vdso for sigreturn and syscall restart
  ...
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 07b2328..a0e2130 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -163,7 +163,6 @@
 	select HAVE_GCC_PLUGINS
 	select HAVE_GENERIC_VDSO
 	select HAVE_IOREMAP_PROT if PCI
-	select HAVE_IRQ_EXIT_ON_IRQ_STACK
 	select HAVE_KERNEL_BZIP2
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_LZ4
@@ -438,6 +437,7 @@
 	select COMPAT_OLD_SIGACTION
 	select HAVE_UID16
 	depends on MULTIUSER
+	depends on !CC_IS_CLANG
 	help
 	  Select this option if you want to enable your system kernel to
 	  handle system-calls from ELF binaries for 31 bit ESA.  This option
diff --git a/arch/s390/Makefile b/arch/s390/Makefile
index 098abe3..95c75e6 100644
--- a/arch/s390/Makefile
+++ b/arch/s390/Makefile
@@ -166,6 +166,19 @@
 archprepare:
 	$(Q)$(MAKE) $(build)=$(syscalls) kapi
 	$(Q)$(MAKE) $(build)=$(tools) kapi
+ifeq ($(KBUILD_EXTMOD),)
+# We need to generate vdso-offsets.h before compiling certain files in kernel/.
+# In order to do that, we should use the archprepare target, but we can't since
+# asm-offsets.h is included in some files used to generate vdso-offsets.h, and
+# asm-offsets.h is built in prepare0, for which archprepare is a dependency.
+# Therefore we need to generate the header after prepare0 has been made, hence
+# this hack.
+prepare: vdso_prepare
+vdso_prepare: prepare0
+	$(Q)$(MAKE) $(build)=arch/s390/kernel/vdso64 include/generated/vdso64-offsets.h
+	$(if $(CONFIG_COMPAT),$(Q)$(MAKE) \
+		$(build)=arch/s390/kernel/vdso32 include/generated/vdso32-offsets.h)
+endif
 
 # Don't use tabs in echo arguments
 define archhelp
diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c
index bbe4df6..d0cf216 100644
--- a/arch/s390/boot/startup.c
+++ b/arch/s390/boot/startup.c
@@ -23,6 +23,7 @@
 unsigned long __bootdata_preserved(MODULES_VADDR);
 unsigned long __bootdata_preserved(MODULES_END);
 unsigned long __bootdata(ident_map_size);
+int __bootdata(is_full_image) = 1;
 
 u64 __bootdata_preserved(stfle_fac_list[16]);
 u64 __bootdata_preserved(alt_stfle_fac_list[16]);
diff --git a/arch/s390/boot/uv.c b/arch/s390/boot/uv.c
index 82b99b9..f6b0c4f 100644
--- a/arch/s390/boot/uv.c
+++ b/arch/s390/boot/uv.c
@@ -36,6 +36,7 @@
 		uv_info.max_sec_stor_addr = ALIGN(uvcb.max_guest_stor_addr, PAGE_SIZE);
 		uv_info.max_num_sec_conf = uvcb.max_num_sec_conf;
 		uv_info.max_guest_cpu_id = uvcb.max_guest_cpu_id;
+		uv_info.uv_feature_indications = uvcb.uv_feature_indications;
 	}
 
 #ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
diff --git a/arch/s390/include/asm/ap.h b/arch/s390/include/asm/ap.h
index 837d169..3afbee2 100644
--- a/arch/s390/include/asm/ap.h
+++ b/arch/s390/include/asm/ap.h
@@ -53,18 +53,20 @@
  */
 static inline bool ap_instructions_available(void)
 {
-	register unsigned long reg0 asm ("0") = AP_MKQID(0, 0);
-	register unsigned long reg1 asm ("1") = 0;
-	register unsigned long reg2 asm ("2") = 0;
+	unsigned long reg0 = AP_MKQID(0, 0);
+	unsigned long reg1 = 0;
 
 	asm volatile(
-		"   .long 0xb2af0000\n"		/* PQAP(TAPQ) */
-		"0: la    %0,1\n"
+		"	lgr	0,%[reg0]\n"   /* qid into gr0 */
+		"	lghi	1,0\n"	       /* 0 into gr1 */
+		"	lghi	2,0\n"	       /* 0 into gr2 */
+		"	.long	0xb2af0000\n"  /* PQAP(TAPQ) */
+		"0:	la	%[reg1],1\n"   /* 1 into reg1 */
 		"1:\n"
 		EX_TABLE(0b, 1b)
-		: "+d" (reg1), "+d" (reg2)
-		: "d" (reg0)
-		: "cc");
+		: [reg1] "+&d" (reg1)
+		: [reg0] "d" (reg0)
+		: "cc", "0", "1", "2");
 	return reg1 != 0;
 }
 
@@ -77,14 +79,18 @@
  */
 static inline struct ap_queue_status ap_tapq(ap_qid_t qid, unsigned long *info)
 {
-	register unsigned long reg0 asm ("0") = qid;
-	register struct ap_queue_status reg1 asm ("1");
-	register unsigned long reg2 asm ("2");
+	struct ap_queue_status reg1;
+	unsigned long reg2;
 
-	asm volatile(".long 0xb2af0000"		/* PQAP(TAPQ) */
-		     : "=d" (reg1), "=d" (reg2)
-		     : "d" (reg0)
-		     : "cc");
+	asm volatile(
+		"	lgr	0,%[qid]\n"    /* qid into gr0 */
+		"	lghi	2,0\n"	       /* 0 into gr2 */
+		"	.long	0xb2af0000\n"  /* PQAP(TAPQ) */
+		"	lgr	%[reg1],1\n"   /* gr1 (status) into reg1 */
+		"	lgr	%[reg2],2\n"   /* gr2 into reg2 */
+		: [reg1] "=&d" (reg1), [reg2] "=&d" (reg2)
+		: [qid] "d" (qid)
+		: "cc", "0", "1", "2");
 	if (info)
 		*info = reg2;
 	return reg1;
@@ -115,14 +121,16 @@
  */
 static inline struct ap_queue_status ap_rapq(ap_qid_t qid)
 {
-	register unsigned long reg0 asm ("0") = qid | (1UL << 24);
-	register struct ap_queue_status reg1 asm ("1");
+	unsigned long reg0 = qid | (1UL << 24);  /* fc 1UL is RAPQ */
+	struct ap_queue_status reg1;
 
 	asm volatile(
-		".long 0xb2af0000"		/* PQAP(RAPQ) */
-		: "=d" (reg1)
-		: "d" (reg0)
-		: "cc");
+		"	lgr	0,%[reg0]\n"  /* qid arg into gr0 */
+		"	.long	0xb2af0000\n" /* PQAP(RAPQ) */
+		"	lgr	%[reg1],1\n"  /* gr1 (status) into reg1 */
+		: [reg1] "=&d" (reg1)
+		: [reg0] "d" (reg0)
+		: "cc", "0", "1");
 	return reg1;
 }
 
@@ -134,14 +142,16 @@
  */
 static inline struct ap_queue_status ap_zapq(ap_qid_t qid)
 {
-	register unsigned long reg0 asm ("0") = qid | (2UL << 24);
-	register struct ap_queue_status reg1 asm ("1");
+	unsigned long reg0 = qid | (2UL << 24);  /* fc 2UL is ZAPQ */
+	struct ap_queue_status reg1;
 
 	asm volatile(
-		".long 0xb2af0000"		/* PQAP(ZAPQ) */
-		: "=d" (reg1)
-		: "d" (reg0)
-		: "cc");
+		"	lgr	0,%[reg0]\n"   /* qid arg into gr0 */
+		"	.long	0xb2af0000\n"  /* PQAP(ZAPQ) */
+		"	lgr	%[reg1],1\n"   /* gr1 (status) into reg1 */
+		: [reg1] "=&d" (reg1)
+		: [reg0] "d" (reg0)
+		: "cc", "0", "1");
 	return reg1;
 }
 
@@ -172,18 +182,20 @@
  */
 static inline int ap_qci(struct ap_config_info *config)
 {
-	register unsigned long reg0 asm ("0") = 4UL << 24;
-	register unsigned long reg1 asm ("1") = -EOPNOTSUPP;
-	register struct ap_config_info *reg2 asm ("2") = config;
+	unsigned long reg0 = 4UL << 24;  /* fc 4UL is QCI */
+	unsigned long reg1 = -EOPNOTSUPP;
+	struct ap_config_info *reg2 = config;
 
 	asm volatile(
-		".long 0xb2af0000\n"		/* PQAP(QCI) */
-		"0: la    %0,0\n"
+		"	lgr	0,%[reg0]\n"   /* QCI fc into gr0 */
+		"	lgr	2,%[reg2]\n"   /* ptr to config into gr2 */
+		"	.long	0xb2af0000\n"  /* PQAP(QCI) */
+		"0:	la	%[reg1],0\n"   /* good case, QCI fc available */
 		"1:\n"
 		EX_TABLE(0b, 1b)
-		: "+d" (reg1)
-		: "d" (reg0), "d" (reg2)
-		: "cc", "memory");
+		: [reg1] "+&d" (reg1)
+		: [reg0] "d" (reg0), [reg2] "d" (reg2)
+		: "cc", "memory", "0", "2");
 
 	return reg1;
 }
@@ -220,21 +232,25 @@
 					     struct ap_qirq_ctrl qirqctrl,
 					     void *ind)
 {
-	register unsigned long reg0 asm ("0") = qid | (3UL << 24);
-	register union {
+	unsigned long reg0 = qid | (3UL << 24);  /* fc 3UL is AQIC */
+	union {
 		unsigned long value;
 		struct ap_qirq_ctrl qirqctrl;
 		struct ap_queue_status status;
-	} reg1 asm ("1");
-	register void *reg2 asm ("2") = ind;
+	} reg1;
+	void *reg2 = ind;
 
 	reg1.qirqctrl = qirqctrl;
 
 	asm volatile(
-		".long 0xb2af0000"		/* PQAP(AQIC) */
-		: "+d" (reg1)
-		: "d" (reg0), "d" (reg2)
-		: "cc");
+		"	lgr	0,%[reg0]\n"   /* qid param into gr0 */
+		"	lgr	1,%[reg1]\n"   /* irq ctrl into gr1 */
+		"	lgr	2,%[reg2]\n"   /* ni addr into gr2 */
+		"	.long	0xb2af0000\n"  /* PQAP(AQIC) */
+		"	lgr	%[reg1],1\n"   /* gr1 (status) into reg1 */
+		: [reg1] "+&d" (reg1)
+		: [reg0] "d" (reg0), [reg2] "d" (reg2)
+		: "cc", "0", "1", "2");
 
 	return reg1.status;
 }
@@ -268,21 +284,24 @@
 static inline struct ap_queue_status ap_qact(ap_qid_t qid, int ifbit,
 					     union ap_qact_ap_info *apinfo)
 {
-	register unsigned long reg0 asm ("0") = qid | (5UL << 24)
-		| ((ifbit & 0x01) << 22);
-	register union {
+	unsigned long reg0 = qid | (5UL << 24) | ((ifbit & 0x01) << 22);
+	union {
 		unsigned long value;
 		struct ap_queue_status status;
-	} reg1 asm ("1");
-	register unsigned long reg2 asm ("2");
+	} reg1;
+	unsigned long reg2;
 
 	reg1.value = apinfo->val;
 
 	asm volatile(
-		".long 0xb2af0000"		/* PQAP(QACT) */
-		: "+d" (reg1), "=d" (reg2)
-		: "d" (reg0)
-		: "cc");
+		"	lgr	0,%[reg0]\n"   /* qid param into gr0 */
+		"	lgr	1,%[reg1]\n"   /* qact in info into gr1 */
+		"	.long	0xb2af0000\n"  /* PQAP(QACT) */
+		"	lgr	%[reg1],1\n"   /* gr1 (status) into reg1 */
+		"	lgr	%[reg2],2\n"   /* qact out info into reg2 */
+		: [reg1] "+&d" (reg1), [reg2] "=&d" (reg2)
+		: [reg0] "d" (reg0)
+		: "cc", "0", "1", "2");
 	apinfo->val = reg2;
 	return reg1.status;
 }
@@ -303,19 +322,24 @@
 					     unsigned long long psmid,
 					     void *msg, size_t length)
 {
-	register unsigned long reg0 asm ("0") = qid | 0x40000000UL;
-	register struct ap_queue_status reg1 asm ("1");
-	register unsigned long reg2 asm ("2") = (unsigned long) msg;
-	register unsigned long reg3 asm ("3") = (unsigned long) length;
-	register unsigned long reg4 asm ("4") = (unsigned int) (psmid >> 32);
-	register unsigned long reg5 asm ("5") = psmid & 0xffffffff;
+	unsigned long reg0 = qid | 0x40000000UL;  /* 0x4... is last msg part */
+	union register_pair nqap_r1, nqap_r2;
+	struct ap_queue_status reg1;
+
+	nqap_r1.even = (unsigned int)(psmid >> 32);
+	nqap_r1.odd  = psmid & 0xffffffff;
+	nqap_r2.even = (unsigned long)msg;
+	nqap_r2.odd  = (unsigned long)length;
 
 	asm volatile (
-		"0: .long 0xb2ad0042\n"		/* NQAP */
-		"   brc   2,0b"
-		: "+d" (reg0), "=d" (reg1), "+d" (reg2), "+d" (reg3)
-		: "d" (reg4), "d" (reg5)
-		: "cc", "memory");
+		"	lgr	0,%[reg0]\n"  /* qid param in gr0 */
+		"0:	.insn	rre,0xb2ad0000,%[nqap_r1],%[nqap_r2]\n"
+		"	brc	2,0b\n"       /* handle partial completion */
+		"	lgr	%[reg1],1\n"  /* gr1 (status) into reg1 */
+		: [reg0] "+&d" (reg0), [reg1] "=&d" (reg1),
+		  [nqap_r2] "+&d" (nqap_r2.pair)
+		: [nqap_r1] "d" (nqap_r1.pair)
+		: "cc", "memory", "0", "1");
 	return reg1;
 }
 
@@ -325,6 +349,8 @@
  * @psmid: Pointer to program supplied message identifier
  * @msg: The message text
  * @length: The message length
+ * @reslength: Residual length on return
+ * @resgr0: input: gr0 value (only used if != 0), output: residual gr0 content
  *
  * Returns AP queue status structure.
  * Condition code 1 on DQAP means the receive has taken place
@@ -336,27 +362,65 @@
  * Note that gpr2 is used by the DQAP instruction to keep track of
  * any 'residual' length, in case the instruction gets interrupted.
  * Hence it gets zeroed before the instruction.
+ * If the message does not fit into the buffer, this function will
+ * return with a truncated message and the reply in the firmware queue
+ * is not removed. This is indicated to the caller with an
+ * ap_queue_status response_code value of all bits on (0xFF) and (if
+ * the reslength ptr is given) the remaining length is stored in
+ * *reslength and (if the resgr0 ptr is given) the updated gr0 value
+ * for further processing of this msg entry is stored in *resgr0. The
+ * caller needs to detect this situation and should invoke ap_dqap
+ * again with a valid resgr0 ptr holding a value != 0 to indicate that
+ * *resgr0 is to be used instead of qid to further process this entry.
  */
 static inline struct ap_queue_status ap_dqap(ap_qid_t qid,
 					     unsigned long long *psmid,
-					     void *msg, size_t length)
+					     void *msg, size_t length,
+					     size_t *reslength,
+					     unsigned long *resgr0)
 {
-	register unsigned long reg0 asm("0") = qid | 0x80000000UL;
-	register struct ap_queue_status reg1 asm ("1");
-	register unsigned long reg2 asm("2") = 0UL;
-	register unsigned long reg4 asm("4") = (unsigned long) msg;
-	register unsigned long reg5 asm("5") = (unsigned long) length;
-	register unsigned long reg6 asm("6") = 0UL;
-	register unsigned long reg7 asm("7") = 0UL;
+	unsigned long reg0 = resgr0 && *resgr0 ? *resgr0 : qid | 0x80000000UL;
+	struct ap_queue_status reg1;
+	unsigned long reg2;
+	union register_pair rp1, rp2;
 
+	rp1.even = 0UL;
+	rp1.odd  = 0UL;
+	rp2.even = (unsigned long)msg;
+	rp2.odd  = (unsigned long)length;
 
 	asm volatile(
-		"0: .long 0xb2ae0064\n"		/* DQAP */
-		"   brc   6,0b\n"
-		: "+d" (reg0), "=d" (reg1), "+d" (reg2),
-		  "+d" (reg4), "+d" (reg5), "+d" (reg6), "+d" (reg7)
-		: : "cc", "memory");
-	*psmid = (((unsigned long long) reg6) << 32) + reg7;
+		"	lgr	0,%[reg0]\n"   /* qid param into gr0 */
+		"	lghi	2,0\n"	       /* 0 into gr2 (res length) */
+		"0:	ltgr	%N[rp2],%N[rp2]\n" /* check buf len */
+		"	jz	2f\n"	       /* go out if buf len is 0 */
+		"1:	.insn	rre,0xb2ae0000,%[rp1],%[rp2]\n"
+		"	brc	6,0b\n"        /* handle partial complete */
+		"2:	lgr	%[reg0],0\n"   /* gr0 (qid + info) into reg0 */
+		"	lgr	%[reg1],1\n"   /* gr1 (status) into reg1 */
+		"	lgr	%[reg2],2\n"   /* gr2 (res length) into reg2 */
+		: [reg0] "+&d" (reg0), [reg1] "=&d" (reg1), [reg2] "=&d" (reg2),
+		  [rp1] "+&d" (rp1.pair), [rp2] "+&d" (rp2.pair)
+		:
+		: "cc", "memory", "0", "1", "2");
+
+	if (reslength)
+		*reslength = reg2;
+	if (reg2 != 0 && rp2.odd == 0) {
+		/*
+		 * Partially complete, status in gr1 is not set.
+		 * Signal the caller that this dqap is only partially received
+		 * with a special status response code 0xFF and *resgr0 updated
+		 */
+		reg1.response_code = 0xFF;
+		if (resgr0)
+			*resgr0 = reg0;
+	} else {
+		*psmid = (((unsigned long long)rp1.even) << 32) + rp1.odd;
+		if (resgr0)
+			*resgr0 = 0;
+	}
+
 	return reg1;
 }
 
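The reworked ap_dqap() above adds a resume protocol for replies larger
than the receive buffer, and ap_nqap()/ap_dqap() now feed the rre-format
instructions through union register_pair (whose even/odd members map to
an even/odd hardware register pair). A minimal caller sketch follows,
assuming kernel context with <asm/ap.h> available; fetch_reply(), buf
and len are illustrative names, not in-tree code:

/*
 * Hedged sketch: drain one AP reply that may exceed the buffer.
 * On response_code 0xFF the reply stays in the firmware queue and
 * resgr0 carries the continuation state for the next ap_dqap() call.
 */
static struct ap_queue_status fetch_reply(ap_qid_t qid, void *buf, size_t len)
{
	struct ap_queue_status status;
	unsigned long long psmid;
	unsigned long resgr0 = 0;	/* 0: start fresh from qid */
	size_t reslen;

	do {
		status = ap_dqap(qid, &psmid, buf, len, &reslen, &resgr0);
		/* consume the chunk in buf here before continuing */
	} while (status.response_code == 0xFF && resgr0);
	return status;
}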
diff --git a/arch/s390/include/asm/cpu_mcf.h b/arch/s390/include/asm/cpu_mcf.h
index 4dcefdd..ca0e0e5 100644
--- a/arch/s390/include/asm/cpu_mcf.h
+++ b/arch/s390/include/asm/cpu_mcf.h
@@ -32,39 +32,22 @@
 	[CPUMF_CTR_SET_MT_DIAG] = 0x20,
 };
 
-static inline void ctr_set_enable(u64 *state, int ctr_set)
-{
-	*state |= cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ENABLE_SHIFT;
-}
-static inline void ctr_set_disable(u64 *state, int ctr_set)
-{
-	*state &= ~(cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ENABLE_SHIFT);
-}
-static inline void ctr_set_start(u64 *state, int ctr_set)
-{
-	*state |= cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ACTCTL_SHIFT;
-}
-static inline void ctr_set_stop(u64 *state, int ctr_set)
-{
-	*state &= ~(cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ACTCTL_SHIFT);
-}
-
-static inline void ctr_set_multiple_enable(u64 *state, u64 ctrsets)
+static inline void ctr_set_enable(u64 *state, u64 ctrsets)
 {
 	*state |= ctrsets << CPUMF_LCCTL_ENABLE_SHIFT;
 }
 
-static inline void ctr_set_multiple_disable(u64 *state, u64 ctrsets)
+static inline void ctr_set_disable(u64 *state, u64 ctrsets)
 {
 	*state &= ~(ctrsets << CPUMF_LCCTL_ENABLE_SHIFT);
 }
 
-static inline void ctr_set_multiple_start(u64 *state, u64 ctrsets)
+static inline void ctr_set_start(u64 *state, u64 ctrsets)
 {
 	*state |= ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT;
 }
 
-static inline void ctr_set_multiple_stop(u64 *state, u64 ctrsets)
+static inline void ctr_set_stop(u64 *state, u64 ctrsets)
 {
 	*state &= ~(ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT);
 }
@@ -92,8 +75,15 @@
 	struct cpumf_ctr_info	info;
 	atomic_t		ctr_set[CPUMF_CTR_SET_MAX];
 	atomic64_t		alert;
-	u64			state;
+	u64			state;		/* For perf_event_open SVC */
+	u64			dev_state;	/* For /dev/hwctr */
 	unsigned int		flags;
+	size_t used;			/* Bytes used in data */
+	size_t usedss;			/* Bytes used in start/stop */
+	unsigned char start[PAGE_SIZE];	/* Counter set at event add */
+	unsigned char stop[PAGE_SIZE];	/* Counter set at event delete */
+	unsigned char data[PAGE_SIZE];	/* Counter set at /dev/hwctr */
+	unsigned int sets;		/* # Counter set saved in memory */
 };
 DECLARE_PER_CPU(struct cpu_cf_events, cpu_cf_events);
 
@@ -124,4 +114,6 @@
 
 size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset,
 			   struct cpumf_ctr_info *info);
+int cfset_online_cpu(unsigned int cpu);
+int cfset_offline_cpu(unsigned int cpu);
 #endif /* _ASM_S390_CPU_MCF_H */
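With this rework the remaining ctr_set_* helpers take a bitmask of
counter sets, which is what made the *_multiple_* variants redundant. A
hedged usage sketch (start_basic_and_user() is an illustrative name;
cpumf_ctr_ctl, lcctl() and the CPUMF_CTR_SET_* constants come from this
header):

static void start_basic_and_user(u64 *state)
{
	u64 sets = cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC] |
		   cpumf_ctr_ctl[CPUMF_CTR_SET_USER];

	ctr_set_enable(state, sets);	/* set the enable bits */
	ctr_set_start(state, sets);	/* set the activation bits */
	lcctl(*state);			/* load counter-set controls */
}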
diff --git a/arch/s390/include/asm/ctl_reg.h b/arch/s390/include/asm/ctl_reg.h
index ed5efbb..adc0179 100644
--- a/arch/s390/include/asm/ctl_reg.h
+++ b/arch/s390/include/asm/ctl_reg.h
@@ -21,8 +21,6 @@
 #define CR0_INTERRUPT_KEY_SUBMASK	BIT(63 - 57)
 #define CR0_MEASUREMENT_ALERT_SUBMASK	BIT(63 - 58)
 
-#define CR2_GUARDED_STORAGE		BIT(63 - 59)
-
 #define CR14_UNUSED_32			BIT(63 - 32)
 #define CR14_UNUSED_33			BIT(63 - 33)
 #define CR14_CHANNEL_REPORT_SUBMASK	BIT(63 - 35)
diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h
index 66d51ad..bd00c94 100644
--- a/arch/s390/include/asm/elf.h
+++ b/arch/s390/include/asm/elf.h
@@ -144,10 +144,6 @@
 #include <linux/sched/mm.h>	/* for task_struct */
 #include <asm/mmu_context.h>
 
-#include <asm/vdso.h>
-
-extern unsigned int vdso_enabled;
-
 /*
  * This is used to ensure we don't load something for the wrong architecture.
  */
@@ -176,7 +172,7 @@
 	    !current->mm->context.alloc_pgste) {		\
 		set_thread_flag(TIF_PGSTE);			\
 		set_pt_regs_flag(task_pt_regs(current),		\
-				 PIF_SYSCALL_RESTART);		\
+				 PIF_EXECVE_PGSTE_RESTART);	\
 		_state->rc = -EAGAIN;				\
 	}							\
 	_state->rc;						\
@@ -268,11 +264,10 @@
 #define STACK_RND_MASK	MMAP_RND_MASK
 
 /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */
-#define ARCH_DLINFO							    \
-do {									    \
-	if (vdso_enabled)						    \
-		NEW_AUX_ENT(AT_SYSINFO_EHDR,				    \
-			    (unsigned long)current->mm->context.vdso_base); \
+#define ARCH_DLINFO							\
+do {									\
+	NEW_AUX_ENT(AT_SYSINFO_EHDR,					\
+		    (unsigned long)current->mm->context.vdso_base);	\
 } while (0)
 
 struct linux_binprm;
diff --git a/arch/s390/include/asm/entry-common.h b/arch/s390/include/asm/entry-common.h
index baa8005..17aead8 100644
--- a/arch/s390/include/asm/entry-common.h
+++ b/arch/s390/include/asm/entry-common.h
@@ -14,7 +14,6 @@
 #define ARCH_EXIT_TO_USER_MODE_WORK (_TIF_GUARDED_STORAGE | _TIF_PER_TRAP)
 
 void do_per_trap(struct pt_regs *regs);
-void do_syscall(struct pt_regs *regs);
 
 #ifdef CONFIG_DEBUG_ENTRY
 static __always_inline void arch_check_user_regs(struct pt_regs *regs)
diff --git a/arch/s390/include/asm/linkage.h b/arch/s390/include/asm/linkage.h
index a0a7a2c..24e8fed 100644
--- a/arch/s390/include/asm/linkage.h
+++ b/arch/s390/include/asm/linkage.h
@@ -5,7 +5,7 @@
 #include <asm/asm-const.h>
 #include <linux/stringify.h>
 
-#define __ALIGN .align 4, 0x07
+#define __ALIGN .align 16, 0x07
 #define __ALIGN_STR __stringify(__ALIGN)
 
 /*
diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h
index 20e51c9..2db45d7 100644
--- a/arch/s390/include/asm/nmi.h
+++ b/arch/s390/include/asm/nmi.h
@@ -23,12 +23,16 @@
 #define MCCK_CODE_SYSTEM_DAMAGE		BIT(63)
 #define MCCK_CODE_EXT_DAMAGE		BIT(63 - 5)
 #define MCCK_CODE_CP			BIT(63 - 9)
-#define MCCK_CODE_CPU_TIMER_VALID	BIT(63 - 46)
+#define MCCK_CODE_STG_ERROR		BIT(63 - 16)
+#define MCCK_CODE_STG_KEY_ERROR		BIT(63 - 18)
+#define MCCK_CODE_STG_DEGRAD		BIT(63 - 19)
 #define MCCK_CODE_PSW_MWP_VALID		BIT(63 - 20)
 #define MCCK_CODE_PSW_IA_VALID		BIT(63 - 23)
+#define MCCK_CODE_STG_FAIL_ADDR		BIT(63 - 24)
 #define MCCK_CODE_CR_VALID		BIT(63 - 29)
 #define MCCK_CODE_GS_VALID		BIT(63 - 36)
 #define MCCK_CODE_FC_VALID		BIT(63 - 43)
+#define MCCK_CODE_CPU_TIMER_VALID	BIT(63 - 46)
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/s390/include/asm/preempt.h b/arch/s390/include/asm/preempt.h
index 23ff51b..d9d5350 100644
--- a/arch/s390/include/asm/preempt.h
+++ b/arch/s390/include/asm/preempt.h
@@ -29,12 +29,6 @@
 				  old, new) != old);
 }
 
-#define init_task_preempt_count(p)	do { } while (0)
-
-#define init_idle_preempt_count(p, cpu)	do { \
-	S390_lowcore.preempt_count = PREEMPT_DISABLED; \
-} while (0)
-
 static inline void set_preempt_need_resched(void)
 {
 	__atomic_and(~PREEMPT_NEED_RESCHED, &S390_lowcore.preempt_count);
@@ -88,12 +82,6 @@
 	S390_lowcore.preempt_count = pc;
 }
 
-#define init_task_preempt_count(p)	do { } while (0)
-
-#define init_idle_preempt_count(p, cpu)	do { \
-	S390_lowcore.preempt_count = PREEMPT_DISABLED; \
-} while (0)
-
 static inline void set_preempt_need_resched(void)
 {
 }
@@ -130,6 +118,10 @@
 
 #endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */
 
+#define init_task_preempt_count(p)	do { } while (0)
+/* Deferred to CPU bringup time */
+#define init_idle_preempt_count(p, cpu)	do { } while (0)
+
 #ifdef CONFIG_PREEMPTION
 extern void preempt_schedule(void);
 #define __preempt_schedule() preempt_schedule()
diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h
index c7850d6..61b22aa 100644
--- a/arch/s390/include/asm/ptrace.h
+++ b/arch/s390/include/asm/ptrace.h
@@ -11,15 +11,15 @@
 #include <uapi/asm/ptrace.h>
 #include <asm/tpi.h>
 
-#define PIF_SYSCALL		0	/* inside a system call */
-#define PIF_SYSCALL_RESTART	1	/* restart the current system call */
-#define PIF_SYSCALL_RET_SET	2	/* return value was set via ptrace */
-#define PIF_GUEST_FAULT		3	/* indicates program check in sie64a */
+#define PIF_SYSCALL			0	/* inside a system call */
+#define PIF_EXECVE_PGSTE_RESTART	1	/* restart execve for PGSTE binaries */
+#define PIF_SYSCALL_RET_SET		2	/* return value was set via ptrace */
+#define PIF_GUEST_FAULT			3	/* indicates program check in sie64a */
 
-#define _PIF_SYSCALL		BIT(PIF_SYSCALL)
-#define _PIF_SYSCALL_RESTART	BIT(PIF_SYSCALL_RESTART)
-#define _PIF_SYSCALL_RET_SET	BIT(PIF_SYSCALL_RET_SET)
-#define _PIF_GUEST_FAULT	BIT(PIF_GUEST_FAULT)
+#define _PIF_SYSCALL			BIT(PIF_SYSCALL)
+#define _PIF_EXECVE_PGSTE_RESTART	BIT(PIF_EXECVE_PGSTE_RESTART)
+#define _PIF_SYSCALL_RET_SET		BIT(PIF_SYSCALL_RET_SET)
+#define _PIF_GUEST_FAULT		BIT(PIF_GUEST_FAULT)
 
 #ifndef __ASSEMBLY__
 
@@ -162,6 +162,14 @@
 	return !!(regs->flags & (1UL << flag));
 }
 
+static inline int test_and_clear_pt_regs_flag(struct pt_regs *regs, int flag)
+{
+	int ret = test_pt_regs_flag(regs, flag);
+
+	clear_pt_regs_flag(regs, flag);
+	return ret;
+}
+
 /*
  * These are defined as per linux/ptrace.h, which see.
  */
diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h
index a8b75da..3a77aa9 100644
--- a/arch/s390/include/asm/setup.h
+++ b/arch/s390/include/asm/setup.h
@@ -159,6 +159,8 @@
 	return __kaslr_offset;
 }
 
+extern int is_full_image;
+
 static inline u32 gen_lpswe(unsigned long addr)
 {
 	BUILD_BUG_ON(addr > 0xfff);
diff --git a/arch/s390/include/asm/softirq_stack.h b/arch/s390/include/asm/softirq_stack.h
new file mode 100644
index 0000000..fd17f25
--- /dev/null
+++ b/arch/s390/include/asm/softirq_stack.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef __ASM_S390_SOFTIRQ_STACK_H
+#define __ASM_S390_SOFTIRQ_STACK_H
+
+#include <asm/lowcore.h>
+#include <asm/stacktrace.h>
+
+static inline void do_softirq_own_stack(void)
+{
+	call_on_stack(0, S390_lowcore.async_stack, void, __do_softirq);
+}
+
+#endif /* __ASM_S390_SOFTIRQ_STACK_H */
diff --git a/arch/s390/include/asm/stacktrace.h b/arch/s390/include/asm/stacktrace.h
index 76c6034..3d8a4b9 100644
--- a/arch/s390/include/asm/stacktrace.h
+++ b/arch/s390/include/asm/stacktrace.h
@@ -74,23 +74,6 @@
 	((unsigned long)__builtin_frame_address(0) -			\
 	 offsetof(struct stack_frame, back_chain))
 
-#define CALL_ARGS_0()							\
-	register unsigned long r2 asm("2")
-#define CALL_ARGS_1(arg1)						\
-	register unsigned long r2 asm("2") = (unsigned long)(arg1)
-#define CALL_ARGS_2(arg1, arg2)						\
-	CALL_ARGS_1(arg1);						\
-	register unsigned long r3 asm("3") = (unsigned long)(arg2)
-#define CALL_ARGS_3(arg1, arg2, arg3)					\
-	CALL_ARGS_2(arg1, arg2);					\
-	register unsigned long r4 asm("4") = (unsigned long)(arg3)
-#define CALL_ARGS_4(arg1, arg2, arg3, arg4)				\
-	CALL_ARGS_3(arg1, arg2, arg3);					\
-	register unsigned long r4 asm("5") = (unsigned long)(arg4)
-#define CALL_ARGS_5(arg1, arg2, arg3, arg4, arg5)			\
-	CALL_ARGS_4(arg1, arg2, arg3, arg4);				\
-	register unsigned long r4 asm("6") = (unsigned long)(arg5)
-
 /*
  * To keep this simple mark register 2-6 as being changed (volatile)
  * by the called function, even though register 6 is saved/nonvolatile.
@@ -109,34 +92,113 @@
 #define CALL_CLOBBER_1 CALL_CLOBBER_2, "3"
 #define CALL_CLOBBER_0 CALL_CLOBBER_1
 
-#define CALL_ON_STACK(fn, stack, nr, args...)				\
+#define CALL_LARGS_0(...)						\
+	long dummy = 0
+#define CALL_LARGS_1(t1, a1)						\
+	long arg1  = (long)(t1)(a1)
+#define CALL_LARGS_2(t1, a1, t2, a2)					\
+	CALL_LARGS_1(t1, a1);						\
+	long arg2 = (long)(t2)(a2)
+#define CALL_LARGS_3(t1, a1, t2, a2, t3, a3)				\
+	CALL_LARGS_2(t1, a1, t2, a2);					\
+	long arg3 = (long)(t3)(a3)
+#define CALL_LARGS_4(t1, a1, t2, a2, t3, a3, t4, a4)			\
+	CALL_LARGS_3(t1, a1, t2, a2, t3, a3);				\
+	long arg4  = (long)(t4)(a4)
+#define CALL_LARGS_5(t1, a1, t2, a2, t3, a3, t4, a4, t5, a5)		\
+	CALL_LARGS_4(t1, a1, t2, a2, t3, a3, t4, a4);			\
+	long arg5 = (long)(t5)(a5)
+
+#define CALL_REGS_0							\
+	register long r2 asm("2") = dummy
+#define CALL_REGS_1							\
+	register long r2 asm("2") = arg1
+#define CALL_REGS_2							\
+	CALL_REGS_1;							\
+	register long r3 asm("3") = arg2
+#define CALL_REGS_3							\
+	CALL_REGS_2;							\
+	register long r4 asm("4") = arg3
+#define CALL_REGS_4							\
+	CALL_REGS_3;							\
+	register long r5 asm("5") = arg4
+#define CALL_REGS_5							\
+	CALL_REGS_4;							\
+	register long r6 asm("6") = arg5
+
+#define CALL_TYPECHECK_0(...)
+#define CALL_TYPECHECK_1(t, a, ...)					\
+	typecheck(t, a)
+#define CALL_TYPECHECK_2(t, a, ...)					\
+	CALL_TYPECHECK_1(__VA_ARGS__);					\
+	typecheck(t, a)
+#define CALL_TYPECHECK_3(t, a, ...)					\
+	CALL_TYPECHECK_2(__VA_ARGS__);					\
+	typecheck(t, a)
+#define CALL_TYPECHECK_4(t, a, ...)					\
+	CALL_TYPECHECK_3(__VA_ARGS__);					\
+	typecheck(t, a)
+#define CALL_TYPECHECK_5(t, a, ...)					\
+	CALL_TYPECHECK_4(__VA_ARGS__);					\
+	typecheck(t, a)
+
+#define CALL_PARM_0(...) void
+#define CALL_PARM_1(t, a, ...) t
+#define CALL_PARM_2(t, a, ...) t, CALL_PARM_1(__VA_ARGS__)
+#define CALL_PARM_3(t, a, ...) t, CALL_PARM_2(__VA_ARGS__)
+#define CALL_PARM_4(t, a, ...) t, CALL_PARM_3(__VA_ARGS__)
+#define CALL_PARM_5(t, a, ...) t, CALL_PARM_4(__VA_ARGS__)
+#define CALL_PARM_6(t, a, ...) t, CALL_PARM_5(__VA_ARGS__)
+
+/*
+ * Use call_on_stack() to call a function switching to a specified
+ * stack. Proper sign and zero extension of function arguments is
+ * done. Usage:
+ *
+ * rc = call_on_stack(nr, stack, rettype, fn, t1, a1, t2, a2, ...)
+ *
+ * - nr specifies the number of function arguments of fn.
+ * - stack specifies the stack to be used.
+ * - fn is the function to be called.
+ * - rettype is the return type of fn.
+ * - t1, a1, ... are pairs, where t1 must match the type of the first
+ *   argument of fn, t2 the second, etc. a1 is the corresponding
+ *   first function argument (not name), etc.
+ */
+#define call_on_stack(nr, stack, rettype, fn, ...)			\
 ({									\
+	rettype (*__fn)(CALL_PARM_##nr(__VA_ARGS__)) = fn;		\
 	unsigned long frame = current_frame_address();			\
-	CALL_ARGS_##nr(args);						\
+	unsigned long __stack = stack;					\
 	unsigned long prev;						\
+	CALL_LARGS_##nr(__VA_ARGS__);					\
+	CALL_REGS_##nr;							\
 									\
+	CALL_TYPECHECK_##nr(__VA_ARGS__);				\
 	asm volatile(							\
-		"	la	%[_prev],0(15)\n"			\
+		"	lgr	%[_prev],15\n"				\
 		"	lg	15,%[_stack]\n"				\
 		"	stg	%[_frame],%[_bc](15)\n"			\
 		"	brasl	14,%[_fn]\n"				\
-		"	la	15,0(%[_prev])\n"			\
-		: [_prev] "=&a" (prev), CALL_FMT_##nr			\
-		: [_stack] "R" (stack),					\
+		"	lgr	15,%[_prev]\n"				\
+		: [_prev] "=&d" (prev), CALL_FMT_##nr			\
+		: [_stack] "R" (__stack),				\
 		  [_bc] "i" (offsetof(struct stack_frame, back_chain)),	\
 		  [_frame] "d" (frame),					\
-		  [_fn] "X" (fn) : CALL_CLOBBER_##nr);			\
-	r2;								\
+		  [_fn] "X" (__fn) : CALL_CLOBBER_##nr);		\
+	(rettype)r2;							\
 })
 
-#define CALL_ON_STACK_NORETURN(fn, stack)				\
+#define call_on_stack_noreturn(fn, stack)				\
 ({									\
+	void (*__fn)(void) = fn;					\
+									\
 	asm volatile(							\
 		"	la	15,0(%[_stack])\n"			\
 		"	xc	%[_bc](8,15),%[_bc](15)\n"		\
 		"	brasl	14,%[_fn]\n"				\
 		::[_bc] "i" (offsetof(struct stack_frame, back_chain)),	\
-		  [_stack] "a" (stack), [_fn] "X" (fn));		\
+		  [_stack] "a" (stack), [_fn] "X" (__fn));		\
 	BUG();								\
 })
 
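Since call_on_stack() now takes (type, argument) pairs plus an explicit
return type, the compiler checks every argument against the function
pointer built via CALL_PARM_*. A hedged sketch with a hypothetical
helper (compute() is not in-tree code):

static long compute(int a, unsigned long b)
{
	return a + (long)b;
}

static long compute_on_nodat_stack(int a, unsigned long b)
{
	return call_on_stack(2, S390_lowcore.nodat_stack, long, compute,
			     int, a, unsigned long, b);
}

A mismatched pair, e.g. passing "unsigned int, a" where compute() takes
an int, now provokes a compile-time typecheck() warning instead of
silently mis-extending a register.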
diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h
index 7b98d4c..12c5f006 100644
--- a/arch/s390/include/asm/uv.h
+++ b/arch/s390/include/asm/uv.h
@@ -73,6 +73,10 @@
 	BIT_UVC_CMD_UNPIN_PAGE_SHARED = 22,
 };
 
+enum uv_feat_ind {
+	BIT_UV_FEAT_MISC = 0,
+};
+
 struct uv_cb_header {
 	u16 len;
 	u16 cmd;	/* Command Code */
@@ -97,7 +101,8 @@
 	u64 max_guest_stor_addr;
 	u8  reserved88[158 - 136];
 	u16 max_guest_cpu_id;
-	u8  reserveda0[200 - 160];
+	u64 uv_feature_indications;
+	u8  reserveda0[200 - 168];
 } __packed __aligned(8);
 
 /* Initialize Ultravisor */
@@ -274,6 +279,7 @@
 	unsigned long max_sec_stor_addr;
 	unsigned int max_num_sec_conf;
 	unsigned short max_guest_cpu_id;
+	unsigned long uv_feature_indications;
 };
 
 extern struct uv_info uv_info;
diff --git a/arch/s390/include/asm/vdso.h b/arch/s390/include/asm/vdso.h
index b45e3dd..53165aa 100644
--- a/arch/s390/include/asm/vdso.h
+++ b/arch/s390/include/asm/vdso.h
@@ -4,18 +4,31 @@
 
 #include <vdso/datapage.h>
 
-/* Default link address for the vDSO */
-#define VDSO64_LBASE	0
-
-#define __VVAR_PAGES	2
-
-#define VDSO_VERSION_STRING	LINUX_2.6.29
-
 #ifndef __ASSEMBLY__
 
+#include <generated/vdso64-offsets.h>
+#ifdef CONFIG_COMPAT
+#include <generated/vdso32-offsets.h>
+#endif
+
+#define VDSO64_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso64_offset_##name))
+#ifdef CONFIG_COMPAT
+#define VDSO32_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso32_offset_##name))
+#else
+#define VDSO32_SYMBOL(tsk, name) (-1UL)
+#endif
+
 extern struct vdso_data *vdso_data;
 
 int vdso_getcpu_init(void);
 
 #endif /* __ASSEMBLY__ */
+
+/* Default link address for the vDSO */
+#define VDSO_LBASE	0
+
+#define __VVAR_PAGES	2
+
+#define VDSO_VERSION_STRING	LINUX_2.6.29
+
 #endif /* __S390_VDSO_H__ */
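The VDSO64_SYMBOL()/VDSO32_SYMBOL() macros resolve a trampoline address
as the per-mm vdso mapping base plus a generated offset. A hedged
sketch of how signal setup can pick a restorer (the compat_signal.c
hunk below shows the in-tree 31-bit usage; pick_restorer() is an
illustrative name):

static unsigned long pick_restorer(struct ksignal *ksig)
{
	if (ksig->ka.sa.sa_flags & SA_RESTORER)
		return (unsigned long)ksig->ka.sa.sa_restorer;
	/* fall back to the vdso rt_sigreturn trampoline */
	return VDSO64_SYMBOL(current, rt_sigreturn);
}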
diff --git a/arch/s390/include/asm/vdso/gettimeofday.h b/arch/s390/include/asm/vdso/gettimeofday.h
index 383c53c..d6465b2 100644
--- a/arch/s390/include/asm/vdso/gettimeofday.h
+++ b/arch/s390/include/asm/vdso/gettimeofday.h
@@ -8,7 +8,6 @@
 
 #include <asm/timex.h>
 #include <asm/unistd.h>
-#include <asm/vdso.h>
 #include <linux/compiler.h>
 
 #define vdso_calc_delta __arch_vdso_calc_delta
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 68ca183..4a44ba5 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -71,10 +71,10 @@
 obj-$(CONFIG_PERF_EVENTS)	+= perf_event.o perf_cpum_cf_common.o
 obj-$(CONFIG_PERF_EVENTS)	+= perf_cpum_cf.o perf_cpum_sf.o
 obj-$(CONFIG_PERF_EVENTS)	+= perf_cpum_cf_events.o perf_regs.o
-obj-$(CONFIG_PERF_EVENTS)	+= perf_cpum_cf_diag.o
 
 obj-$(CONFIG_TRACEPOINTS)	+= trace.o
 obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE))	+= uv.o
 
 # vdso
 obj-y				+= vdso64/
+obj-$(CONFIG_COMPAT)		+= vdso32/
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index f53605a..77ff213 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -14,8 +14,6 @@
 #include <linux/pgtable.h>
 #include <asm/idle.h>
 #include <asm/gmap.h>
-#include <asm/nmi.h>
-#include <asm/setup.h>
 #include <asm/stacktrace.h>
 
 int main(void)
@@ -108,7 +106,6 @@
 	OFFSET(__LC_LAST_UPDATE_CLOCK, lowcore, last_update_clock);
 	OFFSET(__LC_INT_CLOCK, lowcore, int_clock);
 	OFFSET(__LC_MCCK_CLOCK, lowcore, mcck_clock);
-	OFFSET(__LC_CLOCK_COMPARATOR, lowcore, clock_comparator);
 	OFFSET(__LC_BOOT_CLOCK, lowcore, boot_clock);
 	OFFSET(__LC_CURRENT, lowcore, current_task);
 	OFFSET(__LC_KERNEL_STACK, lowcore, kernel_stack);
@@ -145,9 +142,6 @@
 	OFFSET(__LC_CREGS_SAVE_AREA, lowcore, cregs_save_area);
 	OFFSET(__LC_PGM_TDB, lowcore, pgm_tdb);
 	BLANK();
-	/* extended machine check save area */
-	OFFSET(__MCESA_GS_SAVE_AREA, mcesa, guarded_storage_save_area);
-	BLANK();
 	/* gmap/sie offsets */
 	OFFSET(__GMAP_ASCE, gmap, asce);
 	OFFSET(__SIE_PROG0C, kvm_s390_sie_block, prog0c);
diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c
index 1d0e17e..cca142f 100644
--- a/arch/s390/kernel/compat_signal.c
+++ b/arch/s390/kernel/compat_signal.c
@@ -28,6 +28,7 @@
 #include <linux/uaccess.h>
 #include <asm/lowcore.h>
 #include <asm/switch_to.h>
+#include <asm/vdso.h>
 #include "compat_linux.h"
 #include "compat_ptrace.h"
 #include "entry.h"
@@ -118,7 +119,6 @@
 	fpregs_load((_s390_fp_regs *) &user_sregs.fpregs, &current->thread.fpu);
 
 	clear_pt_regs_flag(regs, PIF_SYSCALL); /* No longer in a system call */
-	clear_pt_regs_flag(regs, PIF_SYSCALL_RESTART);
 	return 0;
 }
 
@@ -304,11 +304,7 @@
 		restorer = (unsigned long __force)
 			ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE;
 	} else {
-		/* Signal frames without vectors registers are short ! */
-		__u16 __user *svc = (void __user *) frame + frame_size - 2;
-		if (__put_user(S390_SYSCALL_OPCODE | __NR_sigreturn, svc))
-			return -EFAULT;
-		restorer = (unsigned long __force) svc | PSW32_ADDR_AMODE;
+		restorer = VDSO32_SYMBOL(current, sigreturn);
         }
 
 	/* Set up registers for signal handler */
@@ -371,10 +367,7 @@
 		restorer = (unsigned long __force)
 			ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE;
 	} else {
-		__u16 __user *svc = &frame->svc_insn;
-		if (__put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, svc))
-			return -EFAULT;
-		restorer = (unsigned long __force) svc | PSW32_ADDR_AMODE;
+		restorer = VDSO32_SYMBOL(current, rt_sigreturn);
 	}
 
 	/* Create siginfo on the signal stack */
diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index c2cf79d..fb84e3f 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -33,6 +33,8 @@
 #include <asm/switch_to.h>
 #include "entry.h"
 
+int __bootdata(is_full_image);
+
 static void __init reset_tod_clock(void)
 {
 	union tod_clock clk;
@@ -279,7 +281,7 @@
 
 static void __init check_image_bootable(void)
 {
-	if (!memcmp(EP_STRING, (void *)EP_OFFSET, strlen(EP_STRING)))
+	if (is_full_image)
 		return;
 
 	sclp_early_printk("Linux kernel boot failure: An attempt to boot a vmlinux ELF image failed.\n");
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 3e8c666..5a2f70c 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -14,7 +14,6 @@
 #include <asm/alternative-asm.h>
 #include <asm/processor.h>
 #include <asm/cache.h>
-#include <asm/ctl_reg.h>
 #include <asm/dwarf.h>
 #include <asm/errno.h>
 #include <asm/ptrace.h>
@@ -129,6 +128,24 @@
 		    "jnz .+8; .long 0xb2e8d000", 82
 	.endm
 
+	/*
+	 * The CHKSTG macro jumps to the provided label in case the
+	 * machine check interruption code reports one of the unrecoverable
+	 * storage errors:
+	 * - Storage error uncorrected
+	 * - Storage key error uncorrected
+	 * - Storage degradation with Failing-storage-address validity
+	 */
+	.macro CHKSTG errlabel
+	TSTMSK	__LC_MCCK_CODE,(MCCK_CODE_STG_ERROR|MCCK_CODE_STG_KEY_ERROR)
+	jnz	\errlabel
+	TSTMSK	__LC_MCCK_CODE,MCCK_CODE_STG_DEGRAD
+	jz	oklabel\@
+	TSTMSK	__LC_MCCK_CODE,MCCK_CODE_STG_FAIL_ADDR
+	jnz	\errlabel
+oklabel\@:
+	.endm
+
 #if IS_ENABLED(CONFIG_KVM)
 	/*
 	 * The OUTSIDE macro jumps to the provided label in case the value
@@ -148,6 +165,13 @@
 	clgr	%r14,%r13
 	jhe	\outside_label
 	.endm
+
+	.macro SIEEXIT
+	lg	%r9,__SF_SIE_CONTROL(%r15)	# get control block pointer
+	ni	__SIE_PROG0C+3(%r9),0xfe	# no longer in SIE
+	lctlg	%c1,%c1,__LC_KERNEL_ASCE	# load primary asce
+	larl	%r9,sie_exit			# skip forward to sie_exit
+	.endm
 #endif
 
 	GEN_BR_THUNK %r14
@@ -235,7 +259,6 @@
 # are some corner cases (e.g. runtime instrumentation) where ILC is unpredictable.
 # Other instructions between sie64a and .Lsie_done should not cause program
 # interrupts. So lets use 3 nops as a landing pad for all possible rewinds.
-# See also .Lcleanup_sie
 .Lrewind_pad6:
 	nopr	7
 .Lrewind_pad4:
@@ -341,10 +364,7 @@
 #if IS_ENABLED(CONFIG_KVM)
 	# cleanup critical section for program checks in sie64a
 	OUTSIDE	%r9,.Lsie_gmap,.Lsie_done,1f
-	lg	%r14,__SF_SIE_CONTROL(%r15)	# get control block pointer
-	ni	__SIE_PROG0C+3(%r14),0xfe	# no longer in SIE
-	lctlg	%c1,%c1,__LC_KERNEL_ASCE	# load primary asce
-	larl	%r9,sie_exit			# skip forward to sie_exit
+	SIEEXIT
 	lghi	%r10,_PIF_GUEST_FAULT
 #endif
 1:	tmhh	%r8,0x4000		# PER bit set in old PSW ?
@@ -410,7 +430,8 @@
 	jnz	1f
 #if IS_ENABLED(CONFIG_KVM)
 	OUTSIDE	%r9,.Lsie_gmap,.Lsie_done,0f
-	brasl	%r14,.Lcleanup_sie
+	BPENTER	__SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
+	SIEEXIT
 #endif
 0:	CHECK_STACK __LC_SAVE_AREA_ASYNC
 	aghi	%r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE)
@@ -484,8 +505,6 @@
 	BPOFF
 	la	%r1,4095		# validate r1
 	spt	__LC_CPU_TIMER_SAVE_AREA-4095(%r1)	# validate cpu timer
-	sckc	__LC_CLOCK_COMPARATOR			# validate comparator
-	lam	%a0,%a15,__LC_AREGS_SAVE_AREA-4095(%r1) # validate acrs
 	lmg	%r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# validate gprs
 	lg	%r12,__LC_CURRENT
 	lmg	%r8,%r9,__LC_MCK_OLD_PSW
@@ -496,41 +515,7 @@
 	la	%r14,4095
 	lctlg	%c0,%c15,__LC_CREGS_SAVE_AREA-4095(%r14) # validate ctl regs
 	ptlb
-	lg	%r11,__LC_MCESAD-4095(%r14) # extended machine check save area
-	nill	%r11,0xfc00		# MCESA_ORIGIN_MASK
-	TSTMSK	__LC_CREGS_SAVE_AREA+16-4095(%r14),CR2_GUARDED_STORAGE
-	jno	0f
-	TSTMSK	__LC_MCCK_CODE,MCCK_CODE_GS_VALID
-	jno	0f
-	.insn	 rxy,0xe3000000004d,0,__MCESA_GS_SAVE_AREA(%r11) # LGSC
-0:	l	%r14,__LC_FP_CREG_SAVE_AREA-4095(%r14)
-	TSTMSK	__LC_MCCK_CODE,MCCK_CODE_FC_VALID
-	jo	0f
-	sr	%r14,%r14
-0:	sfpc	%r14
-	TSTMSK	__LC_MACHINE_FLAGS,MACHINE_FLAG_VX
-	jo	0f
-	lghi	%r14,__LC_FPREGS_SAVE_AREA
-	ld	%f0,0(%r14)
-	ld	%f1,8(%r14)
-	ld	%f2,16(%r14)
-	ld	%f3,24(%r14)
-	ld	%f4,32(%r14)
-	ld	%f5,40(%r14)
-	ld	%f6,48(%r14)
-	ld	%f7,56(%r14)
-	ld	%f8,64(%r14)
-	ld	%f9,72(%r14)
-	ld	%f10,80(%r14)
-	ld	%f11,88(%r14)
-	ld	%f12,96(%r14)
-	ld	%f13,104(%r14)
-	ld	%f14,112(%r14)
-	ld	%f15,120(%r14)
-	j	1f
-0:	VLM	%v0,%v15,0,%r11
-	VLM	%v16,%v31,256,%r11
-1:	lghi	%r14,__LC_CPU_TIMER_SAVE_AREA
+	lghi	%r14,__LC_CPU_TIMER_SAVE_AREA
 	mvc	__LC_MCCK_ENTER_TIMER(8),0(%r14)
 	TSTMSK	__LC_MCCK_CODE,MCCK_CODE_CPU_TIMER_VALID
 	jo	3f
@@ -546,24 +531,29 @@
 3:	TSTMSK	__LC_MCCK_CODE,MCCK_CODE_PSW_MWP_VALID
 	jno	.Lmcck_panic
 	tmhh	%r8,0x0001		# interrupting from user ?
-	jnz	4f
+	jnz	6f
 	TSTMSK	__LC_MCCK_CODE,MCCK_CODE_PSW_IA_VALID
 	jno	.Lmcck_panic
-4:	ssm	__LC_PGM_NEW_PSW	# turn dat on, keep irqs off
-	tmhh	%r8,0x0001			# interrupting from user ?
-	jnz	.Lmcck_user
 #if IS_ENABLED(CONFIG_KVM)
-	OUTSIDE	%r9,.Lsie_gmap,.Lsie_done,.Lmcck_stack
-	OUTSIDE	%r9,.Lsie_entry,.Lsie_skip,5f
+	OUTSIDE	%r9,.Lsie_gmap,.Lsie_done,6f
+	OUTSIDE	%r9,.Lsie_entry,.Lsie_skip,4f
 	oi	__LC_CPU_FLAGS+7, _CIF_MCCK_GUEST
-5:	brasl	%r14,.Lcleanup_sie
-#endif
+	j	5f
+4:	CHKSTG	.Lmcck_panic
+5:	larl	%r14,.Lstosm_tmp
+	stosm	0(%r14),0x04		# turn dat on, keep irqs off
+	BPENTER	__SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
+	SIEEXIT
 	j	.Lmcck_stack
-.Lmcck_user:
+#endif
+6:	CHKSTG	.Lmcck_panic
+	larl	%r14,.Lstosm_tmp
+	stosm	0(%r14),0x04		# turn dat on, keep irqs off
+	tmhh	%r8,0x0001		# interrupting from user ?
+	jz	.Lmcck_stack
 	BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP
 .Lmcck_stack:
 	lg	%r15,__LC_MCCK_STACK
-.Lmcck_skip:
 	la	%r11,STACK_FRAME_OVERHEAD(%r15)
 	stctg	%c1,%c1,__PT_CR1(%r11)
 	lctlg	%c1,%c1,__LC_KERNEL_ASCE
@@ -605,8 +595,33 @@
 	b	__LC_RETURN_MCCK_LPSWE
 
 .Lmcck_panic:
-	lg	%r15,__LC_NODAT_STACK
-	j	.Lmcck_skip
+	/*
+	 * Iterate over all possible CPU addresses in the range 0..0xffff
+	 * and stop each CPU using signal processor. Use compare and swap
+	 * to allow just one CPU-stopper and prevent concurrent CPUs from
+	 * stopping each other while leaving the others running.
+	 */
+	lhi	%r5,0
+	lhi	%r6,1
+	larl	%r7,.Lstop_lock
+	cs	%r5,%r6,0(%r7)		# single CPU-stopper only
+	jnz	4f
+	larl	%r7,.Lthis_cpu
+	stap	0(%r7)			# this CPU address
+	lh	%r4,0(%r7)
+	nilh	%r4,0
+	lhi	%r0,1
+	sll	%r0,16			# CPU counter
+	lhi	%r3,0			# next CPU address
+0:	cr	%r3,%r4
+	je	2f
+1:	sigp	%r1,%r3,SIGP_STOP	# stop next CPU
+	brc	SIGP_CC_BUSY,1b
+2:	ahi	%r3,1
+	brct	%r0,0b
+3:	sigp	%r1,%r4,SIGP_STOP	# stop this CPU
+	brc	SIGP_CC_BUSY,3b
+4:	j	4b
 ENDPROC(mcck_int_handler)
 
 #
@@ -657,15 +672,11 @@
 ENDPROC(stack_overflow)
 #endif
 
-#if IS_ENABLED(CONFIG_KVM)
-.Lcleanup_sie:
-	BPENTER	__SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
-	lg	%r9,__SF_SIE_CONTROL(%r15)	# get control block pointer
-	ni	__SIE_PROG0C+3(%r9),0xfe	# no longer in SIE
-	lctlg	%c1,%c1,__LC_KERNEL_ASCE
-	larl	%r9,sie_exit			# skip forward to sie_exit
-	BR_EX	%r14,%r13
-#endif
+	.section .data, "aw"
+		.align	4
+.Lstop_lock:	.long	0
+.Lthis_cpu:	.short	0
+.Lstosm_tmp:	.byte	0
 	.section .rodata, "a"
 #define SYSCALL(esame,emu)	.quad __s390x_ ## esame
 	.globl	sys_call_table
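For reference, the new .Lmcck_panic sequence in hedged C terms: one CPU
wins the compare-and-swap lock and SIGP-stops all other CPU addresses
before stopping itself (sigp() and stap() stand in for the raw
SIGP/STAP instructions; none of this is in-tree code):

static void mcck_panic_stop_all(void)
{
	static int stop_lock;			/* .Lstop_lock */
	unsigned short me = stap();		/* own CPU address */
	unsigned int addr;

	if (cmpxchg(&stop_lock, 0, 1) != 0)	/* single CPU-stopper only */
		for (;;)
			;			/* lost the race: spin (4: j 4b) */
	for (addr = 0; addr <= 0xffff; addr++) {
		if (addr == me)
			continue;
		while (sigp(addr, SIGP_STOP) == SIGP_CC_BUSY)
			;			/* retry while busy */
	}
	while (sigp(me, SIGP_STOP) == SIGP_CC_BUSY)
		;				/* finally stop this CPU */
}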
diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c
index c0df406..234d085 100644
--- a/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c
@@ -110,15 +110,17 @@
 {
 	unsigned long frame = current_frame_address();
 
-	return !!!((S390_lowcore.async_stack - frame) >> (PAGE_SHIFT + THREAD_SIZE_ORDER));
+	return ((S390_lowcore.async_stack ^ frame) & ~(THREAD_SIZE - 1)) == 0;
 }
 
 static void do_irq_async(struct pt_regs *regs, int irq)
 {
-	if (on_async_stack())
+	if (on_async_stack()) {
 		do_IRQ(regs, irq);
-	else
-		CALL_ON_STACK(do_IRQ, S390_lowcore.async_stack, 2, regs, irq);
+	} else {
+		call_on_stack(2, S390_lowcore.async_stack, void, do_IRQ,
+			      struct pt_regs *, regs, int, irq);
+	}
 }
 
 static int irq_pending(struct pt_regs *regs)
@@ -266,24 +268,6 @@
 }
 
 /*
- * Switch to the asynchronous interrupt stack for softirq execution.
- */
-void do_softirq_own_stack(void)
-{
-	unsigned long old, new;
-
-	old = current_stack_pointer();
-	/* Check against async. stack address range. */
-	new = S390_lowcore.async_stack;
-	if (((new - old) >> (PAGE_SHIFT + THREAD_SIZE_ORDER)) != 0) {
-		CALL_ON_STACK(__do_softirq, new, 0);
-	} else {
-		/* We are already on the async stack. */
-		__do_softirq();
-	}
-}
-
-/*
  * ext_int_hash[index] is the list head for all external interrupts that hash
  * to this index.
  */
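The simplified on_async_stack() test relies on stacks being
THREAD_SIZE-aligned: XOR keeps only the differing address bits, and
masking away the in-stack offset bits leaves zero exactly when both
addresses fall inside the same stack. A standalone demo with made-up
addresses (the THREAD_SIZE value is illustrative):

#include <stdio.h>

#define THREAD_SIZE 0x4000UL	/* illustrative 16K stacks */

int main(void)
{
	unsigned long async_stack = 0x3e004000UL + THREAD_SIZE - 64;
	unsigned long same  = 0x3e004000UL + 0x1000;	/* same stack */
	unsigned long other = 0x3e008000UL + 0x1000;	/* different stack */

	printf("%d\n", ((async_stack ^ same)  & ~(THREAD_SIZE - 1)) == 0); /* 1 */
	printf("%d\n", ((async_stack ^ other) & ~(THREAD_SIZE - 1)) == 0); /* 0 */
	return 0;
}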
diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c
index 528bb31..52d056a 100644
--- a/arch/s390/kernel/kprobes.c
+++ b/arch/s390/kernel/kprobes.c
@@ -92,11 +92,6 @@
 }
 NOKPROBE_SYMBOL(copy_instruction);
 
-static inline int is_kernel_addr(void *addr)
-{
-	return addr < (void *)_end;
-}
-
 static int s390_get_insn_slot(struct kprobe *p)
 {
 	/*
@@ -105,7 +100,7 @@
 	 * field can be patched and executed within the insn slot.
 	 */
 	p->ainsn.insn = NULL;
-	if (is_kernel_addr(p->addr))
+	if (is_kernel((unsigned long)p->addr))
 		p->ainsn.insn = get_s390_insn_slot();
 	else if (is_module_addr(p->addr))
 		p->ainsn.insn = get_insn_slot();
@@ -117,7 +112,7 @@
 {
 	if (!p->ainsn.insn)
 		return;
-	if (is_kernel_addr(p->addr))
+	if (is_kernel((unsigned long)p->addr))
 		free_s390_insn_slot(p->ainsn.insn, 0);
 	else
 		free_insn_slot(p->ainsn.insn, 0);
diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c
index d91989c..1005a69 100644
--- a/arch/s390/kernel/machine_kexec.c
+++ b/arch/s390/kernel/machine_kexec.c
@@ -132,7 +132,8 @@
 	int rc;
 
 	preempt_disable();
-	rc = CALL_ON_STACK(do_start_kdump, S390_lowcore.nodat_stack, 1, image);
+	rc = call_on_stack(1, S390_lowcore.nodat_stack, unsigned long, do_start_kdump,
+			   unsigned long, (unsigned long)image);
 	preempt_enable();
 	return rc == 0;
 #else
diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c
index 11f8c29..20f8e18 100644
--- a/arch/s390/kernel/nmi.c
+++ b/arch/s390/kernel/nmi.c
@@ -189,12 +189,16 @@
  * returns 0 if all required registers are available
  * returns 1 otherwise
  */
-static int notrace s390_check_registers(union mci mci, int umode)
+static int notrace s390_validate_registers(union mci mci, int umode)
 {
+	struct mcesa *mcesa;
+	void *fpt_save_area;
 	union ctlreg2 cr2;
 	int kill_task;
+	u64 zero;
 
 	kill_task = 0;
+	zero = 0;
 
 	if (!mci.gr) {
 		/*
@@ -205,14 +209,6 @@
 			s390_handle_damage();
 		kill_task = 1;
 	}
-	/* Check control registers */
-	if (!mci.cr) {
-		/*
-		 * Control registers have unknown contents.
-		 * Can't recover and therefore stopping machine.
-		 */
-		s390_handle_damage();
-	}
 	if (!mci.fp) {
 		/*
 		 * Floating point registers can't be restored. If the
@@ -225,35 +221,89 @@
 		if (!test_cpu_flag(CIF_FPU))
 			kill_task = 1;
 	}
+	fpt_save_area = &S390_lowcore.floating_pt_save_area;
 	if (!mci.fc) {
 		/*
 		 * Floating point control register can't be restored.
 		 * If the kernel currently uses the floating point
 		 * registers and needs the FPC register the system is
 		 * stopped. If the process has its floating point
-		 * registers loaded it is terminated.
+		 * registers loaded it is terminated. Otherwise the
+		 * FPC is just validated.
 		 */
 		if (S390_lowcore.fpu_flags & KERNEL_FPC)
 			s390_handle_damage();
+		asm volatile(
+			"	lfpc	%0\n"
+			:
+			: "Q" (zero));
 		if (!test_cpu_flag(CIF_FPU))
 			kill_task = 1;
+	} else {
+		asm volatile(
+			"	lfpc	%0\n"
+			:
+			: "Q" (S390_lowcore.fpt_creg_save_area));
 	}
 
-	if (MACHINE_HAS_VX) {
+	mcesa = (struct mcesa *)(S390_lowcore.mcesad & MCESA_ORIGIN_MASK);
+	if (!MACHINE_HAS_VX) {
+		/* Validate floating point registers */
+		asm volatile(
+			"	ld	0,0(%0)\n"
+			"	ld	1,8(%0)\n"
+			"	ld	2,16(%0)\n"
+			"	ld	3,24(%0)\n"
+			"	ld	4,32(%0)\n"
+			"	ld	5,40(%0)\n"
+			"	ld	6,48(%0)\n"
+			"	ld	7,56(%0)\n"
+			"	ld	8,64(%0)\n"
+			"	ld	9,72(%0)\n"
+			"	ld	10,80(%0)\n"
+			"	ld	11,88(%0)\n"
+			"	ld	12,96(%0)\n"
+			"	ld	13,104(%0)\n"
+			"	ld	14,112(%0)\n"
+			"	ld	15,120(%0)\n"
+			:
+			: "a" (fpt_save_area)
+			: "memory");
+	} else {
+		/* Validate vector registers */
+		union ctlreg0 cr0;
+
 		if (!mci.vr) {
 			/*
 			 * Vector registers can't be restored. If the kernel
 			 * currently uses vector registers the system is
 			 * stopped. If the process has its vector registers
-			 * loaded it is terminated.
+			 * loaded it is terminated. Otherwise just validate
+			 * the registers.
 			 */
 			if (S390_lowcore.fpu_flags & KERNEL_VXR)
 				s390_handle_damage();
 			if (!test_cpu_flag(CIF_FPU))
 				kill_task = 1;
 		}
+		cr0.val = S390_lowcore.cregs_save_area[0];
+		cr0.afp = cr0.vx = 1;
+		__ctl_load(cr0.val, 0, 0);
+		asm volatile(
+			"	la	1,%0\n"
+			"	.word	0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */
+			"	.word	0xe70f,0x1100,0x0c36\n" /* vlm 16,31,256(1) */
+			:
+			: "Q" (*(struct vx_array *)mcesa->vector_save_area)
+			: "1");
+		__ctl_load(S390_lowcore.cregs_save_area[0], 0, 0);
 	}
-	/* Check if access registers are valid */
+	/* Validate access registers */
+	asm volatile(
+		"	lam	0,15,0(%0)\n"
+		:
+		: "a" (&S390_lowcore.access_regs_save_area)
+		: "memory");
 	if (!mci.ar) {
 		/*
 		 * Access registers have unknown contents.
@@ -261,7 +311,7 @@
 		 */
 		kill_task = 1;
 	}
-	/* Check guarded storage registers */
+	/* Validate guarded storage registers */
 	cr2.val = S390_lowcore.cregs_save_area[2];
 	if (cr2.gse) {
 		if (!mci.gs) {
@@ -271,31 +321,26 @@
 			 * It has to be terminated.
 			 */
 			kill_task = 1;
+		} else {
+			load_gs_cb((struct gs_cb *)mcesa->guarded_storage_save_area);
 		}
 	}
-	/* Check if old PSW is valid */
-	if (!mci.wp) {
-		/*
-		 * Can't tell if we come from user or kernel mode
-		 * -> stopping machine.
-		 */
-		s390_handle_damage();
-	}
-	/* Check for invalid kernel instruction address */
-	if (!mci.ia && !umode) {
-		/*
-		 * The instruction address got lost while running
-		 * in the kernel -> stopping machine.
-		 */
-		s390_handle_damage();
-	}
+	/*
+	 * The getcpu vdso syscall reads CPU number from the programmable
+	 * field of the TOD clock. Disregard the TOD programmable register
+	 * validity bit and load the CPU number into the TOD programmable
+	 * field unconditionally.
+	 */
+	set_tod_programmable_field(raw_smp_processor_id());
+	/* Validate clock comparator register */
+	set_clock_comparator(S390_lowcore.clock_comparator);
 
 	if (!mci.ms || !mci.pm || !mci.ia)
 		kill_task = 1;
 
 	return kill_task;
 }
-NOKPROBE_SYMBOL(s390_check_registers);
+NOKPROBE_SYMBOL(s390_validate_registers);
 
 /*
  * Backup the guest's machine check info to its description block
@@ -353,11 +398,6 @@
 	mci.val = S390_lowcore.mcck_interruption_code;
 	mcck = this_cpu_ptr(&cpu_mcck);
 
-	if (mci.sd) {
-		/* System damage -> stopping machine */
-		s390_handle_damage();
-	}
-
 	/*
 	 * Reinject the instruction processing damages' machine checks
 	 * including Delayed Access Exception into the guest
@@ -398,7 +438,7 @@
 			s390_handle_damage();
 		}
 	}
-	if (s390_check_registers(mci, user_mode(regs))) {
+	if (s390_validate_registers(mci, user_mode(regs))) {
 		/*
 		 * Couldn't restore all register contents for the
 		 * user space process -> mark task for termination.
@@ -428,21 +468,6 @@
 		mcck_pending = 1;
 	}
 
-	/*
-	 * Reinject storage related machine checks into the guest if they
-	 * happen when the guest is running.
-	 */
-	if (!test_cpu_flag(CIF_MCCK_GUEST)) {
-		if (mci.se)
-			/* Storage error uncorrected */
-			s390_handle_damage();
-		if (mci.ke)
-			/* Storage key-error uncorrected */
-			s390_handle_damage();
-		if (mci.ds && mci.fa)
-			/* Storage degradation */
-			s390_handle_damage();
-	}
 	if (mci.cp) {
 		/* Channel report word pending */
 		mcck->channel_report = 1;
diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c
index 1b7a052..975a00c 100644
--- a/arch/s390/kernel/perf_cpum_cf.c
+++ b/arch/s390/kernel/perf_cpum_cf.c
@@ -2,8 +2,9 @@
 /*
  * Performance event support for s390x - CPU-measurement Counter Facility
  *
- *  Copyright IBM Corp. 2012, 2019
+ *  Copyright IBM Corp. 2012, 2021
  *  Author(s): Hendrik Brueckner <brueckner@linux.ibm.com>
+ *	       Thomas Richter <tmricht@linux.ibm.com>
  */
 #define KMSG_COMPONENT	"cpum_cf"
 #define pr_fmt(fmt)	KMSG_COMPONENT ": " fmt
@@ -14,7 +15,223 @@
 #include <linux/notifier.h>
 #include <linux/init.h>
 #include <linux/export.h>
+#include <linux/miscdevice.h>
+
 #include <asm/cpu_mcf.h>
+#include <asm/hwctrset.h>
+#include <asm/debug.h>
+
+static unsigned int cfdiag_cpu_speed;	/* CPU speed for CF_DIAG trailer */
+static debug_info_t *cf_dbg;
+
+#define	CF_DIAG_CTRSET_DEF		0xfeef	/* Counter set header mark */
+						/* interval in seconds */
+
+/* Counter sets are stored as a data stream in a page-sized memory buffer and
+ * exported to user space via raw data attached to the event sample data.
+ * Each counter set starts with an eight byte header consisting of:
+ * - a two byte eye catcher (0xfeef)
+ * - a one byte counter set number
+ * - a two byte counter set size (indicates the number of counters in this set)
+ * - a three byte reserved value (must be zero) to make the header the same
+ *   size as a counter value.
+ * All counter values are eight byte in size.
+ *
+ * All counter sets are followed by a 64 byte trailer.
+ * The trailer consists of a:
+ * - flag field indicating valid fields when corresponding bit set
+ * - the counter facility first and second version number
+ * - the CPU speed if nonzero
+ * - the time stamp the counter sets have been collected
+ * - the time of day (TOD) base value
+ * - the machine type.
+ *
+ * The counter sets are saved when the process is prepared to be executed on a
+ * CPU and saved again when the process is going to be removed from a CPU.
+ * The difference between both counter sets is calculated and stored in the
+ * sample data area.
+ */
+struct cf_ctrset_entry {	/* CPU-M CF counter set entry (8 byte) */
+	unsigned int def:16;	/* 0-15  Data Entry Format */
+	unsigned int set:16;	/* 16-31 Counter set identifier */
+	unsigned int ctr:16;	/* 32-47 Number of stored counters */
+	unsigned int res1:16;	/* 48-63 Reserved */
+};
+
+struct cf_trailer_entry {	/* CPU-M CF_DIAG trailer (64 byte) */
+	/* 0 - 7 */
+	union {
+		struct {
+			unsigned int clock_base:1;	/* TOD clock base set */
+			unsigned int speed:1;		/* CPU speed set */
+			/* Measurement alerts */
+			unsigned int mtda:1;	/* Loss of MT ctr. data alert */
+			unsigned int caca:1;	/* Counter auth. change alert */
+			unsigned int lcda:1;	/* Loss of counter data alert */
+		};
+		unsigned long flags;	/* 0-63    All indicators */
+	};
+	/* 8 - 15 */
+	unsigned int cfvn:16;			/* 64-79   Ctr First Version */
+	unsigned int csvn:16;			/* 80-95   Ctr Second Version */
+	unsigned int cpu_speed:32;		/* 96-127  CPU speed */
+	/* 16 - 23 */
+	unsigned long timestamp;		/* 128-191 Timestamp (TOD) */
+	/* 24 - 55 */
+	union {
+		struct {
+			unsigned long progusage1;
+			unsigned long progusage2;
+			unsigned long progusage3;
+			unsigned long tod_base;
+		};
+		unsigned long progusage[4];
+	};
+	/* 56 - 63 */
+	unsigned int mach_type:16;		/* Machine type */
+	unsigned int res1:16;			/* Reserved */
+	unsigned int res2:32;			/* Reserved */
+};
+
+/* Create the trailer data at the end of a page. */
+static void cfdiag_trailer(struct cf_trailer_entry *te)
+{
+	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+	struct cpuid cpuid;
+
+	te->cfvn = cpuhw->info.cfvn;		/* Counter version numbers */
+	te->csvn = cpuhw->info.csvn;
+
+	get_cpu_id(&cpuid);			/* Machine type */
+	te->mach_type = cpuid.machine;
+	te->cpu_speed = cfdiag_cpu_speed;
+	if (te->cpu_speed)
+		te->speed = 1;
+	te->clock_base = 1;			/* Save clock base */
+	te->tod_base = tod_clock_base.tod;
+	te->timestamp = get_tod_clock_fast();
+}
+
+/* Read a counter set. The counter set number determines the counter set and
+ * the CPUM-CF first and second version number determine the number of
+ * available counters in each counter set.
+ * Each counter set starts with a header containing the counter set number and
+ * the number of eight byte counters.
+ *
+ * The function returns the number of bytes occupied by this counter set,
+ * including the header.
+ * If there is no counter in the counter set, the set is useless and
+ * zero is returned in this case.
+ *
+ * Note that the counter sets may not be enabled or active and the stcctm
+ * instruction might return error 3. Depending on the error_ok value this is
+ * ok, for example when called from the cpumf_pmu_start() callback function.
+ */
+static size_t cfdiag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset,
+			       size_t room, bool error_ok)
+{
+	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+	size_t ctrset_size, need = 0;
+	int rc = 3;				/* Assume write failure */
+
+	ctrdata->def = CF_DIAG_CTRSET_DEF;
+	ctrdata->set = ctrset;
+	ctrdata->res1 = 0;
+	ctrset_size = cpum_cf_ctrset_size(ctrset, &cpuhw->info);
+
+	if (ctrset_size) {			/* Save data */
+		need = ctrset_size * sizeof(u64) + sizeof(*ctrdata);
+		if (need <= room) {
+			rc = ctr_stcctm(ctrset, ctrset_size,
+					(u64 *)(ctrdata + 1));
+		}
+		if (rc != 3 || error_ok)
+			ctrdata->ctr = ctrset_size;
+		else
+			need = 0;
+	}
+
+	debug_sprintf_event(cf_dbg, 3,
+			    "%s ctrset %d ctrset_size %zu cfvn %d csvn %d"
+			    " need %zd rc %d\n", __func__, ctrset, ctrset_size,
+			    cpuhw->info.cfvn, cpuhw->info.csvn, need, rc);
+	return need;
+}
+
+/* Read out all counter sets and save them in the provided data buffer.
+ * The last 64 bytes hold an artificial trailer entry.
+ */
+static size_t cfdiag_getctr(void *data, size_t sz, unsigned long auth,
+			    bool error_ok)
+{
+	struct cf_trailer_entry *trailer;
+	size_t offset = 0, done;
+	int i;
+
+	memset(data, 0, sz);
+	sz -= sizeof(*trailer);		/* Always room for trailer */
+	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+		struct cf_ctrset_entry *ctrdata = data + offset;
+
+		if (!(auth & cpumf_ctr_ctl[i]))
+			continue;	/* Counter set not authorized */
+
+		done = cfdiag_getctrset(ctrdata, i, sz - offset, error_ok);
+		offset += done;
+	}
+	trailer = data + offset;
+	cfdiag_trailer(trailer);
+	return offset + sizeof(*trailer);
+}
+
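+/* Sketch of the buffer layout produced by cfdiag_getctr(): one eight byte
+ * cf_ctrset_entry header per authorized counter set, each followed by its
+ * eight byte counter values, with the 64 byte cf_trailer_entry last:
+ *
+ *	+--------+-------------+--------+-------------+-----+---------+
+ *	| hdr(0) | counters... | hdr(1) | counters... | ... | trailer |
+ *	+--------+-------------+--------+-------------+-----+---------+
+ */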
+/* Calculate the difference for each counter in a counter set. */
+static void cfdiag_diffctrset(u64 *pstart, u64 *pstop, int counters)
+{
+	for (; --counters >= 0; ++pstart, ++pstop)
+		if (*pstop >= *pstart)
+			*pstop -= *pstart;
+		else
+			*pstop = *pstart - *pstop + 1;
+}
+
+/* Scan the counter sets and calculate the difference of each counter
+ * in each set. The result is the increment of each counter during the
+ * period the counter set has been activated.
+ *
+ * Return true on success.
+ */
+static int cfdiag_diffctr(struct cpu_cf_events *cpuhw, unsigned long auth)
+{
+	struct cf_trailer_entry *trailer_start, *trailer_stop;
+	struct cf_ctrset_entry *ctrstart, *ctrstop;
+	size_t offset = 0;
+
+	auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1;
+	do {
+		ctrstart = (struct cf_ctrset_entry *)(cpuhw->start + offset);
+		ctrstop = (struct cf_ctrset_entry *)(cpuhw->stop + offset);
+
+		if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) {
+			pr_err_once("cpum_cf_diag counter set compare error "
+				    "in set %i\n", ctrstart->set);
+			return 0;
+		}
+		auth &= ~cpumf_ctr_ctl[ctrstart->set];
+		if (ctrstart->def == CF_DIAG_CTRSET_DEF) {
+			cfdiag_diffctrset((u64 *)(ctrstart + 1),
+					  (u64 *)(ctrstop + 1), ctrstart->ctr);
+			offset += ctrstart->ctr * sizeof(u64) +
+							sizeof(*ctrstart);
+		}
+	} while (ctrstart->def && auth);
+
+	/* Save time_stamp from start of event in stop's trailer */
+	trailer_start = (struct cf_trailer_entry *)(cpuhw->start + offset);
+	trailer_stop = (struct cf_trailer_entry *)(cpuhw->stop + offset);
+	trailer_stop->progusage[0] = trailer_start->timestamp;
+
+	return 1;
+}
 
 static enum cpumf_ctr_set get_counter_set(u64 event)
 {
@@ -34,7 +251,8 @@
 	return set;
 }
 
-static int validate_ctr_version(const struct hw_perf_event *hwc)
+static int validate_ctr_version(const struct hw_perf_event *hwc,
+				enum cpumf_ctr_set set)
 {
 	struct cpu_cf_events *cpuhw;
 	int err = 0;
@@ -43,7 +261,7 @@
 	cpuhw = &get_cpu_var(cpu_cf_events);
 
 	/* check required version for counter sets */
-	switch (hwc->config_base) {
+	switch (set) {
 	case CPUMF_CTR_SET_BASIC:
 	case CPUMF_CTR_SET_USER:
 		if (cpuhw->info.cfvn < 1)
@@ -86,6 +304,8 @@
 		      (cpuhw->info.act_ctl & mtdiag_ctl)))
 			err = -EOPNOTSUPP;
 		break;
+	case CPUMF_CTR_SET_MAX:
+		err = -EOPNOTSUPP;
 	}
 
 	put_cpu_var(cpu_cf_events);
@@ -95,7 +315,6 @@
 static int validate_ctr_auth(const struct hw_perf_event *hwc)
 {
 	struct cpu_cf_events *cpuhw;
-	u64 ctrs_state;
 	int err = 0;
 
 	cpuhw = &get_cpu_var(cpu_cf_events);
@@ -105,8 +324,7 @@
 	 * return with -ENOENT in order to fall back to other
 	 * PMUs that might suffice the event request.
 	 */
-	ctrs_state = cpumf_ctr_ctl[hwc->config_base];
-	if (!(ctrs_state & cpuhw->info.auth_ctl))
+	if (!(hwc->config_base & cpuhw->info.auth_ctl))
 		err = -ENOENT;
 
 	put_cpu_var(cpu_cf_events);
@@ -126,7 +344,7 @@
 	if (cpuhw->flags & PMU_F_ENABLED)
 		return;
 
-	err = lcctl(cpuhw->state);
+	err = lcctl(cpuhw->state | cpuhw->dev_state);
 	if (err) {
 		pr_err("Enabling the performance measuring unit "
 		       "failed with rc=%x\n", err);
@@ -151,6 +369,7 @@
 		return;
 
 	inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1);
+	inactive |= cpuhw->dev_state;
 	err = lcctl(inactive);
 	if (err) {
 		pr_err("Disabling the performance measuring unit "
@@ -199,6 +418,14 @@
 	[PERF_COUNT_HW_BUS_CYCLES]	    = -1,
 };
 
+static void cpumf_hw_inuse(void)
+{
+	mutex_lock(&pmc_reserve_mutex);
+	if (atomic_inc_return(&num_events) == 1)
+		__kernel_cpumcf_begin();
+	mutex_unlock(&pmc_reserve_mutex);
+}
+
 static int __hw_perf_event_init(struct perf_event *event, unsigned int type)
 {
 	struct perf_event_attr *attr = &event->attr;
@@ -258,11 +485,11 @@
 		/*
 		 * Use the hardware perf event structure to store the
 		 * counter number in the 'config' member and the counter
-		 * set number in the 'config_base'.  The counter set number
-		 * is then later used to enable/disable the counter(s).
+		 * set number in the 'config_base' as bit mask.
+		 * It is later used to enable/disable the counter(s).
 		 */
 		hwc->config = ev;
-		hwc->config_base = set;
+		hwc->config_base = cpumf_ctr_ctl[set];
 		break;
 	case CPUMF_CTR_SET_MAX:
 		/* The counter could not be associated to a counter set */
@@ -270,22 +497,13 @@
 	}
 
 	/* Initialize for using the CPU-measurement counter facility */
-	if (!atomic_inc_not_zero(&num_events)) {
-		mutex_lock(&pmc_reserve_mutex);
-		if (atomic_read(&num_events) == 0 && __kernel_cpumcf_begin())
-			err = -EBUSY;
-		else
-			atomic_inc(&num_events);
-		mutex_unlock(&pmc_reserve_mutex);
-	}
-	if (err)
-		return err;
+	cpumf_hw_inuse();
 	event->destroy = hw_perf_event_destroy;
 
 	/* Finally, validate version and authorization of the counter set */
 	err = validate_ctr_auth(hwc);
 	if (!err)
-		err = validate_ctr_version(hwc);
+		err = validate_ctr_version(hwc, set);
 
 	return err;
 }
@@ -361,6 +579,7 @@
 {
 	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
 	struct hw_perf_event *hwc = &event->hw;
+	int i;
 
 	if (!(hwc->state & PERF_HES_STOPPED))
 		return;
@@ -376,29 +595,92 @@
 	 * needs to be synchronized.  At this point, the counter set can be in
 	 * the inactive or disabled state.
 	 */
-	hw_perf_event_reset(event);
+	if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) {
+		cpuhw->usedss = cfdiag_getctr(cpuhw->start,
+					      sizeof(cpuhw->start),
+					      hwc->config_base, true);
+	} else {
+		hw_perf_event_reset(event);
+	}
 
-	/* increment refcount for this counter set */
-	atomic_inc(&cpuhw->ctr_set[hwc->config_base]);
+	/* Increment refcount for counter sets */
+	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i)
+		if ((hwc->config_base & cpumf_ctr_ctl[i]))
+			atomic_inc(&cpuhw->ctr_set[i]);
+}
+
+/* Create a perf event sample with the counter sets as raw data. The sample
+ * is then pushed to the event subsystem and the function checks for
+ * possible event overflows. If an event overflow occurs, the PMU is
+ * stopped.
+ *
+ * Return non-zero if an event overflow occurred.
+ */
+static int cfdiag_push_sample(struct perf_event *event,
+			      struct cpu_cf_events *cpuhw)
+{
+	struct perf_sample_data data;
+	struct perf_raw_record raw;
+	struct pt_regs regs;
+	int overflow;
+
+	/* Setup perf sample */
+	perf_sample_data_init(&data, 0, event->hw.last_period);
+	memset(&regs, 0, sizeof(regs));
+	memset(&raw, 0, sizeof(raw));
+
+	if (event->attr.sample_type & PERF_SAMPLE_CPU)
+		data.cpu_entry.cpu = event->cpu;
+	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+		raw.frag.size = cpuhw->usedss;
+		raw.frag.data = cpuhw->stop;
+		raw.size = raw.frag.size;
+		data.raw = &raw;
+	}
+
+	overflow = perf_event_overflow(event, &data, &regs);
+	debug_sprintf_event(cf_dbg, 3,
+			    "%s event %#llx sample_type %#llx raw %d ov %d\n",
+			    __func__, event->hw.config,
+			    event->attr.sample_type, raw.size, overflow);
+	if (overflow)
+		event->pmu->stop(event, 0);
+
+	perf_event_update_userpage(event);
+	return overflow;
 }
 
 static void cpumf_pmu_stop(struct perf_event *event, int flags)
 {
 	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
 	struct hw_perf_event *hwc = &event->hw;
+	int i;
 
 	if (!(hwc->state & PERF_HES_STOPPED)) {
 		/* Decrement reference count for this counter set and if this
 		 * is the last used counter in the set, clear activation
 		 * control and set the counter set state to inactive.
 		 */
-		if (!atomic_dec_return(&cpuhw->ctr_set[hwc->config_base]))
-			ctr_set_stop(&cpuhw->state, hwc->config_base);
+		for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+			if (!(hwc->config_base & cpumf_ctr_ctl[i]))
+				continue;
+			if (!atomic_dec_return(&cpuhw->ctr_set[i]))
+				ctr_set_stop(&cpuhw->state, cpumf_ctr_ctl[i]);
+		}
 		hwc->state |= PERF_HES_STOPPED;
 	}
 
 	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
-		hw_perf_event_update(event);
+		if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) {
+			local64_inc(&event->count);
+			cpuhw->usedss = cfdiag_getctr(cpuhw->stop,
+						      sizeof(cpuhw->stop),
+						      event->hw.config_base,
+						      false);
+			if (cfdiag_diffctr(cpuhw, event->hw.config_base))
+				cfdiag_push_sample(event, cpuhw);
+		} else
+			hw_perf_event_update(event);
 		hwc->state |= PERF_HES_UPTODATE;
 	}
 }
@@ -419,6 +701,7 @@
 static void cpumf_pmu_del(struct perf_event *event, int flags)
 {
 	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+	int i;
 
 	cpumf_pmu_stop(event, PERF_EF_UPDATE);
 
@@ -430,8 +713,9 @@
 	 * clear enable control and resets all counters in a set.  Therefore,
 	 * cpumf_pmu_start() always has to reenable a counter set.
 	 */
-	if (!atomic_read(&cpuhw->ctr_set[event->hw.config_base]))
-		ctr_set_disable(&cpuhw->state, event->hw.config_base);
+	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i)
+		if (!atomic_read(&cpuhw->ctr_set[i]))
+			ctr_set_disable(&cpuhw->state, cpumf_ctr_ctl[i]);
 }
 
 /* Performance monitoring unit for s390x */
@@ -448,6 +732,7 @@
 	.read	      = cpumf_pmu_read,
 };
 
+static int cfset_init(void);
 static int __init cpumf_pmu_init(void)
 {
 	int rc;
@@ -455,10 +740,689 @@
 	if (!kernel_cpumcf_avail())
 		return -ENODEV;
 
+	/* Setup s390dbf facility */
+	cf_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128);
+	if (!cf_dbg) {
+		pr_err("Registration of s390dbf(cpum_cf) failed\n");
+		return -ENOMEM;
+	}
+	debug_register_view(cf_dbg, &debug_sprintf_view);
+
 	cpumf_pmu.attr_groups = cpumf_cf_event_group();
 	rc = perf_pmu_register(&cpumf_pmu, "cpum_cf", -1);
-	if (rc)
+	if (rc) {
+		debug_unregister_view(cf_dbg, &debug_sprintf_view);
+		debug_unregister(cf_dbg);
 		pr_err("Registering the cpum_cf PMU failed with rc=%i\n", rc);
+	} else if (stccm_avail()) {	/* Setup counter set device */
+		cfset_init();
+	}
 	return rc;
 }
-subsys_initcall(cpumf_pmu_init);
+
+/* Support for the CPU Measurement Facility counter set extraction using
+ * device /dev/hwctr. This allows user space programs to extract complete
+ * counter sets via normal file operations.
+ */
+
+static atomic_t cfset_opencnt = ATOMIC_INIT(0);	/* Excl. access */
+static DEFINE_MUTEX(cfset_ctrset_mutex);	/* Synchronize access to hardware */
+struct cfset_call_on_cpu_parm {		/* Parm struct for smp_call_on_cpu */
+	unsigned int sets;		/* Counter set bit mask */
+	atomic_t cpus_ack;		/* # CPUs successfully executed func */
+};
+
+static struct cfset_request {		/* CPUs and counter set bit mask */
+	unsigned long ctrset;		/* Bit mask of counter set to read */
+	cpumask_t mask;			/* CPU mask to read from */
+} cfset_request;
+
+static void cfset_ctrset_clear(void)
+{
+	cpumask_clear(&cfset_request.mask);
+	cfset_request.ctrset = 0;
+}
+
+/* The /dev/hwctr device access uses PMU_F_IN_USE to mark that the device
+ * access path is currently in use.
+ * The cpu_cf_events::dev_state member denotes the counter sets in use by this
+ * interface. It is always or'ed in. If this interface is not active, its
+ * value is zero and no additional counter sets will be included.
+ *
+ * The cpu_cf_events::state member is used by the perf_event_open() SVC and
+ * remains unchanged.
+ *
+ * perf_pmu_enable() and perf_pmu_disable() and their callbacks
+ * cpumf_pmu_enable() and cpumf_pmu_disable() are called by the
+ * performance measurement subsystem to enable the per process
+ * CPU Measurement counter facility.
+ * On x86 the XXX_enable() and XXX_disable() callbacks are used to turn
+ * off the performance monitoring interrupt (PMI) during scheduling.
+ * s390 uses these calls to temporarily stop and resume the active CPU
+ * counter sets during scheduling.
+ *
+ * Concurrent use of the perf_event_open() SVC and the /dev/hwctr device
+ * is allowed.  The perf_event_open() SVC interface makes a lot of effort
+ * to only run the counters while the calling process is actively scheduled
+ * to run.
+ * When the /dev/hwctr interface is used at the same time, the counter sets
+ * will keep running, even when the process is scheduled off a CPU.
+ * However this is not a problem and does not lead to wrong counter values
+ * for the perf_event_open() SVC. The current counter value is recorded
+ * during schedule-in. At schedule-out time the current counter value is
+ * extracted again and the delta is calculated and added to the event.
+ */
+/* Stop all counter sets via ioctl interface */
+static void cfset_ioctl_off(void *parm)
+{
+	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+	struct cfset_call_on_cpu_parm *p = parm;
+	int rc;
+
+	cpuhw->dev_state = 0;
+	for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc)
+		if ((p->sets & cpumf_ctr_ctl[rc]))
+			atomic_dec(&cpuhw->ctr_set[rc]);
+	rc = lcctl(cpuhw->state);	/* Keep perf_event_open counter sets */
+	if (rc)
+		pr_err("Counter set stop %#llx of /dev/%s failed rc=%i\n",
+		       cpuhw->state, S390_HWCTR_DEVICE, rc);
+	cpuhw->flags &= ~PMU_F_IN_USE;
+	debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n",
+			    __func__, rc, cpuhw->state, cpuhw->dev_state);
+}
+
+/* Start counter sets on particular CPU */
+static void cfset_ioctl_on(void *parm)
+{
+	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+	struct cfset_call_on_cpu_parm *p = parm;
+	int rc;
+
+	cpuhw->flags |= PMU_F_IN_USE;
+	ctr_set_enable(&cpuhw->dev_state, p->sets);
+	ctr_set_start(&cpuhw->dev_state, p->sets);
+	for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc)
+		if ((p->sets & cpumf_ctr_ctl[rc]))
+			atomic_inc(&cpuhw->ctr_set[rc]);
+	rc = lcctl(cpuhw->dev_state | cpuhw->state);	/* Start counter sets */
+	if (!rc)
+		atomic_inc(&p->cpus_ack);
+	else
+		pr_err("Counter set start %#llx of /dev/%s failed rc=%i\n",
+		       cpuhw->dev_state | cpuhw->state, S390_HWCTR_DEVICE, rc);
+	debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n",
+			    __func__, rc, cpuhw->state, cpuhw->dev_state);
+}
+
+static void cfset_release_cpu(void *p)
+{
+	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+	int rc;
+
+	debug_sprintf_event(cf_dbg, 4, "%s state %#llx dev_state %#llx\n",
+			    __func__, cpuhw->state, cpuhw->dev_state);
+	rc = lcctl(cpuhw->state);	/* Keep perf_event_open counter sets */
+	if (rc)
+		pr_err("Counter set release %#llx of /dev/%s failed rc=%i\n",
+		       cpuhw->state, S390_HWCTR_DEVICE, rc);
+	cpuhw->dev_state = 0;
+}
+
+/* The release function is also called when the application terminates without
+ * issuing a proper ioctl(..., S390_HWCTR_STOP, ...) command.
+ */
+static int cfset_release(struct inode *inode, struct file *file)
+{
+	on_each_cpu(cfset_release_cpu, NULL, 1);
+	hw_perf_event_destroy(NULL);
+	cfset_ctrset_clear();
+	atomic_set(&cfset_opencnt, 0);
+	return 0;
+}
+
+static int cfset_open(struct inode *inode, struct file *file)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	/* Only one user space program can open /dev/hwctr */
+	if (atomic_xchg(&cfset_opencnt, 1))
+		return -EBUSY;
+
+	cpumf_hw_inuse();
+	file->private_data = NULL;
+	/* nonseekable_open() never fails */
+	return nonseekable_open(inode, file);
+}
+
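+/* A consequence of the atomic_xchg() in cfset_open(): a second concurrent
+ * open() of /dev/hwctr fails with -EBUSY until the first file descriptor is
+ * closed and cfset_release() has reset cfset_opencnt to zero.
+ */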
+static int cfset_all_stop(void)
+{
+	struct cfset_call_on_cpu_parm p = {
+		.sets = cfset_request.ctrset,
+	};
+	cpumask_var_t mask;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+	cpumask_and(mask, &cfset_request.mask, cpu_online_mask);
+	on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1);
+	free_cpumask_var(mask);
+	return 0;
+}
+
+static int cfset_all_start(void)
+{
+	struct cfset_call_on_cpu_parm p = {
+		.sets = cfset_request.ctrset,
+		.cpus_ack = ATOMIC_INIT(0),
+	};
+	cpumask_var_t mask;
+	int rc = 0;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+	cpumask_and(mask, &cfset_request.mask, cpu_online_mask);
+	on_each_cpu_mask(mask, cfset_ioctl_on, &p, 1);
+	if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) {
+		on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1);
+		rc = -EIO;
+		debug_sprintf_event(cf_dbg, 4, "%s CPUs missing", __func__);
+	}
+	free_cpumask_var(mask);
+	return rc;
+}
+
+/* Return the maximum required space for all possible CPUs in case one
+ * CPU is onlined during the START, READ, STOP cycles.
+ * To find out the size of the counter sets, any one CPU will do. They
+ * all have the same counter sets.
+ */
+static size_t cfset_needspace(unsigned int sets)
+{
+	struct cpu_cf_events *cpuhw = get_cpu_ptr(&cpu_cf_events);
+	size_t bytes = 0;
+	int i;
+
+	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+		if (!(sets & cpumf_ctr_ctl[i]))
+			continue;
+		bytes += cpum_cf_ctrset_size(i, &cpuhw->info) * sizeof(u64) +
+			 sizeof(((struct s390_ctrset_setdata *)0)->set) +
+			 sizeof(((struct s390_ctrset_setdata *)0)->no_cnts);
+	}
+	bytes = sizeof(((struct s390_ctrset_read *)0)->no_cpus) + nr_cpu_ids *
+		(bytes + sizeof(((struct s390_ctrset_cpudata *)0)->cpu_nr) +
+		     sizeof(((struct s390_ctrset_cpudata *)0)->no_sets));
+	put_cpu_ptr(&cpu_cf_events);
+	return bytes;
+}
+
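+/* Worked example of the calculation above (counter set sizes are assumed
+ * values for illustration): with two authorized sets of 6 and 2 counters,
+ * the per-CPU payload is (6 + 2) * sizeof(u64) plus two set headers. The
+ * returned total is the s390_ctrset_read::no_cpus field plus nr_cpu_ids
+ * copies of payload + cpu_nr + no_sets, so the buffer also covers CPUs
+ * that come online between the START and READ subcommands.
+ */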
+static int cfset_all_copy(unsigned long arg, cpumask_t *mask)
+{
+	struct s390_ctrset_read __user *ctrset_read;
+	unsigned int cpu, cpus, rc;
+	void __user *uptr;
+
+	ctrset_read = (struct s390_ctrset_read __user *)arg;
+	uptr = ctrset_read->data;
+	for_each_cpu(cpu, mask) {
+		struct cpu_cf_events *cpuhw = per_cpu_ptr(&cpu_cf_events, cpu);
+		struct s390_ctrset_cpudata __user *ctrset_cpudata;
+
+		ctrset_cpudata = uptr;
+		rc  = put_user(cpu, &ctrset_cpudata->cpu_nr);
+		rc |= put_user(cpuhw->sets, &ctrset_cpudata->no_sets);
+		rc |= copy_to_user(ctrset_cpudata->data, cpuhw->data,
+				   cpuhw->used);
+		if (rc)
+			return -EFAULT;
+		uptr += sizeof(struct s390_ctrset_cpudata) + cpuhw->used;
+		cond_resched();
+	}
+	cpus = cpumask_weight(mask);
+	if (put_user(cpus, &ctrset_read->no_cpus))
+		return -EFAULT;
+	debug_sprintf_event(cf_dbg, 4, "%s copied %ld\n", __func__,
+			    uptr - (void __user *)ctrset_read->data);
+	return 0;
+}
+
+static size_t cfset_cpuset_read(struct s390_ctrset_setdata *p, int ctrset,
+				int ctrset_size, size_t room)
+{
+	size_t need = 0;
+	int rc = -1;
+
+	need = sizeof(*p) + sizeof(u64) * ctrset_size;
+	if (need <= room) {
+		p->set = cpumf_ctr_ctl[ctrset];
+		p->no_cnts = ctrset_size;
+		rc = ctr_stcctm(ctrset, ctrset_size, (u64 *)p->cv);
+		if (rc == 3)		/* Nothing stored */
+			need = 0;
+	}
+	return need;
+}
+
+/* Read all counter sets. */
+static void cfset_cpu_read(void *parm)
+{
+	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+	struct cfset_call_on_cpu_parm *p = parm;
+	int set, set_size;
+	size_t space;
+
+	/* No data saved yet */
+	cpuhw->used = 0;
+	cpuhw->sets = 0;
+	memset(cpuhw->data, 0, sizeof(cpuhw->data));
+
+	/* Scan the counter sets */
+	for (set = CPUMF_CTR_SET_BASIC; set < CPUMF_CTR_SET_MAX; ++set) {
+		struct s390_ctrset_setdata *sp = (void *)cpuhw->data +
+						 cpuhw->used;
+
+		if (!(p->sets & cpumf_ctr_ctl[set]))
+			continue;	/* Counter set not in list */
+		set_size = cpum_cf_ctrset_size(set, &cpuhw->info);
+		space = sizeof(cpuhw->data) - cpuhw->used;
+		space = cfset_cpuset_read(sp, set, set_size, space);
+		if (space) {
+			cpuhw->used += space;
+			cpuhw->sets += 1;
+		}
+	}
+	debug_sprintf_event(cf_dbg, 4, "%s sets %d used %zd\n", __func__,
+			    cpuhw->sets, cpuhw->used);
+}
+
+static int cfset_all_read(unsigned long arg)
+{
+	struct cfset_call_on_cpu_parm p;
+	cpumask_var_t mask;
+	int rc;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	p.sets = cfset_request.ctrset;
+	cpumask_and(mask, &cfset_request.mask, cpu_online_mask);
+	on_each_cpu_mask(mask, cfset_cpu_read, &p, 1);
+	rc = cfset_all_copy(arg, mask);
+	free_cpumask_var(mask);
+	return rc;
+}
+
+static long cfset_ioctl_read(unsigned long arg)
+{
+	struct s390_ctrset_read read;
+	int ret = 0;
+
+	if (copy_from_user(&read, (char __user *)arg, sizeof(read)))
+		return -EFAULT;
+	ret = cfset_all_read(arg);
+	return ret;
+}
+
+static long cfset_ioctl_stop(void)
+{
+	int ret = -ENXIO;
+
+	if (cfset_request.ctrset) {
+		ret = cfset_all_stop();
+		cfset_ctrset_clear();
+	}
+	return ret;
+}
+
+static long cfset_ioctl_start(unsigned long arg)
+{
+	struct s390_ctrset_start __user *ustart;
+	struct s390_ctrset_start start;
+	void __user *umask;
+	unsigned int len;
+	int ret = 0;
+	size_t need;
+
+	if (cfset_request.ctrset)
+		return -EBUSY;
+	ustart = (struct s390_ctrset_start __user *)arg;
+	if (copy_from_user(&start, ustart, sizeof(start)))
+		return -EFAULT;
+	if (start.version != S390_HWCTR_START_VERSION)
+		return -EINVAL;
+	if (start.counter_sets & ~(cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC] |
+				   cpumf_ctr_ctl[CPUMF_CTR_SET_USER] |
+				   cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO] |
+				   cpumf_ctr_ctl[CPUMF_CTR_SET_EXT] |
+				   cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG]))
+		return -EINVAL;		/* Invalid counter set */
+	if (!start.counter_sets)
+		return -EINVAL;		/* No counter set at all? */
+	cpumask_clear(&cfset_request.mask);
+	len = min_t(u64, start.cpumask_len, cpumask_size());
+	umask = (void __user *)start.cpumask;
+	if (copy_from_user(&cfset_request.mask, umask, len))
+		return -EFAULT;
+	if (cpumask_empty(&cfset_request.mask))
+		return -EINVAL;
+	need = cfset_needspace(start.counter_sets);
+	if (put_user(need, &ustart->data_bytes))
+		ret = -EFAULT;
+	if (ret)
+		goto out;
+	cfset_request.ctrset = start.counter_sets;
+	ret = cfset_all_start();
+out:
+	if (ret)
+		cfset_ctrset_clear();
+	debug_sprintf_event(cf_dbg, 4, "%s sets %#lx need %ld ret %d\n",
+			    __func__, cfset_request.ctrset, need, ret);
+	return ret;
+}
+
+/* Entry point to the /dev/hwctr device interface.
+ * The ioctl system call supports three subcommands:
+ * S390_HWCTR_START: Start the specified counter sets on a CPU list. The
+ *    counter sets keep running until explicitly stopped. Returns the number
+ *    of bytes needed to store the counter values. If another S390_HWCTR_START
+ *    ioctl subcommand is issued without a previous S390_HWCTR_STOP
+ *    command, -EBUSY is returned.
+ * S390_HWCTR_READ: Read the counter set values from the CPU list given
+ *    with the S390_HWCTR_START command.
+ * S390_HWCTR_STOP: Stop the counter sets on the CPU list given with the
+ *    previous S390_HWCTR_START subcommand.
+ */
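+/* Minimal user space usage sketch of the three subcommands (illustrative
+ * only; error handling and sizing the read buffer via the returned
+ * data_bytes value are omitted):
+ *
+ *	struct s390_ctrset_start start = {
+ *		.version = S390_HWCTR_START_VERSION,
+ *		.counter_sets = ...,	(bit mask of requested counter sets)
+ *		.cpumask_len = ...,	(length of the CPU bit mask in bytes)
+ *		.cpumask = ...,		(pointer to the CPU bit mask)
+ *	};
+ *	int fd = open("/dev/hwctr", O_RDWR);
+ *
+ *	ioctl(fd, S390_HWCTR_START, &start);	(start.data_bytes now valid)
+ *	ioctl(fd, S390_HWCTR_READ, buffer);	(buffer of data_bytes size)
+ *	ioctl(fd, S390_HWCTR_STOP);
+ *	close(fd);
+ */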
+static long cfset_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	int ret;
+
+	get_online_cpus();
+	mutex_lock(&cfset_ctrset_mutex);
+	switch (cmd) {
+	case S390_HWCTR_START:
+		ret = cfset_ioctl_start(arg);
+		break;
+	case S390_HWCTR_STOP:
+		ret = cfset_ioctl_stop();
+		break;
+	case S390_HWCTR_READ:
+		ret = cfset_ioctl_read(arg);
+		break;
+	default:
+		ret = -ENOTTY;
+		break;
+	}
+	mutex_unlock(&cfset_ctrset_mutex);
+	put_online_cpus();
+	return ret;
+}
+
+static const struct file_operations cfset_fops = {
+	.owner = THIS_MODULE,
+	.open = cfset_open,
+	.release = cfset_release,
+	.unlocked_ioctl	= cfset_ioctl,
+	.compat_ioctl = cfset_ioctl,
+	.llseek = no_llseek
+};
+
+static struct miscdevice cfset_dev = {
+	.name	= S390_HWCTR_DEVICE,
+	.minor	= MISC_DYNAMIC_MINOR,
+	.fops	= &cfset_fops,
+};
+
+int cfset_online_cpu(unsigned int cpu)
+{
+	struct cfset_call_on_cpu_parm p;
+
+	mutex_lock(&cfset_ctrset_mutex);
+	if (cfset_request.ctrset) {
+		p.sets = cfset_request.ctrset;
+		cfset_ioctl_on(&p);
+		cpumask_set_cpu(cpu, &cfset_request.mask);
+	}
+	mutex_unlock(&cfset_ctrset_mutex);
+	return 0;
+}
+
+int cfset_offline_cpu(unsigned int cpu)
+{
+	struct cfset_call_on_cpu_parm p;
+
+	mutex_lock(&cfset_ctrset_mutex);
+	if (cfset_request.ctrset) {
+		p.sets = cfset_request.ctrset;
+		cfset_ioctl_off(&p);
+		cpumask_clear_cpu(cpu, &cfset_request.mask);
+	}
+	mutex_unlock(&cfset_ctrset_mutex);
+	return 0;
+}
+
+static void cfdiag_read(struct perf_event *event)
+{
+	debug_sprintf_event(cf_dbg, 3, "%s event %#llx count %ld\n", __func__,
+			    event->attr.config, local64_read(&event->count));
+}
+
+static int get_authctrsets(void)
+{
+	struct cpu_cf_events *cpuhw;
+	unsigned long auth = 0;
+	enum cpumf_ctr_set i;
+
+	cpuhw = &get_cpu_var(cpu_cf_events);
+	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+		if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i])
+			auth |= cpumf_ctr_ctl[i];
+	}
+	put_cpu_var(cpu_cf_events);
+	return auth;
+}
+
+/* Setup the event. Test for authorized counter sets and only include counter
+ * sets which are authorized at the time of the setup. Including unauthorized
+ * counter sets results in a specification exception (and panic).
+ */
+ */
+static int cfdiag_event_init2(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+	int err = 0;
+
+	/* Set sample_period to indicate sampling */
+	event->hw.config = attr->config;
+	event->hw.sample_period = attr->sample_period;
+	local64_set(&event->hw.period_left, event->hw.sample_period);
+	local64_set(&event->count, 0);
+	event->hw.last_period = event->hw.sample_period;
+
+	/* Add all authorized counter sets to config_base. The hardware
+	 * init function is either called per-cpu or just once for all
+	 * CPUs (event->cpu == -1).  This depends on whether counting is
+	 * started for all CPUs or on a per workload basis where the perf
+	 * event moves from one CPU to another CPU.
+	 * Checking the authorization on any CPU is fine as the hardware
+	 * applies the same authorization settings to all CPUs.
+	 */
+	event->hw.config_base = get_authctrsets();
+
+	/* No authorized counter sets, nothing to count/sample */
+	if (!event->hw.config_base)
+		err = -EINVAL;
+
+	debug_sprintf_event(cf_dbg, 5, "%s err %d config_base %#lx\n",
+			    __func__, err, event->hw.config_base);
+	return err;
+}
+
+static int cfdiag_event_init(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+	int err = -ENOENT;
+
+	if (event->attr.config != PERF_EVENT_CPUM_CF_DIAG ||
+	    event->attr.type != event->pmu->type)
+		goto out;
+
+	/* Raw events are used to access counters directly,
+	 * hence do not permit excludes.
+	 * This event is useless without PERF_SAMPLE_RAW to return counter set
+	 * values as raw data.
+	 */
+	if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv ||
+	    !(attr->sample_type & (PERF_SAMPLE_CPU | PERF_SAMPLE_RAW))) {
+		err = -EOPNOTSUPP;
+		goto out;
+	}
+
+	/* Initialize for using the CPU-measurement counter facility */
+	cpumf_hw_inuse();
+	event->destroy = hw_perf_event_destroy;
+
+	err = cfdiag_event_init2(event);
+	if (unlikely(err))
+		event->destroy(event);
+out:
+	return err;
+}
+
+/* Create the cf_diag/events/CF_DIAG event sysfs file. This counter is used
+ * to collect the complete counter sets for a scheduled process. The result
+ * is the complete counter sets attached as raw data to the artificial event.
+ * This makes the complete counter sets available when a process is
+ * scheduled. They contain the delta of every counter while the process was
+ * running.
+ */
+CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG);
+
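+/* The CF_DIAG event defined above appears in sysfs under
+ * /sys/bus/event_source/devices/cpum_cf_diag/events/CF_DIAG once the PMU
+ * is registered. Since cfdiag_event_init() requires PERF_SAMPLE_CPU or
+ * PERF_SAMPLE_RAW, a sampling invocation along these lines is expected
+ * (illustrative command only):
+ *
+ *	perf record -R -e cpum_cf_diag/CF_DIAG/ -- sleep 1
+ */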
+static struct attribute *cfdiag_events_attr[] = {
+	CPUMF_EVENT_PTR(CF_DIAG, CF_DIAG),
+	NULL,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-63");
+
+static struct attribute *cfdiag_format_attr[] = {
+	&format_attr_event.attr,
+	NULL,
+};
+
+static struct attribute_group cfdiag_events_group = {
+	.name = "events",
+	.attrs = cfdiag_events_attr,
+};
+static struct attribute_group cfdiag_format_group = {
+	.name = "format",
+	.attrs = cfdiag_format_attr,
+};
+static const struct attribute_group *cfdiag_attr_groups[] = {
+	&cfdiag_events_group,
+	&cfdiag_format_group,
+	NULL,
+};
+
+/* Performance monitoring unit for event CF_DIAG. Since this event
+ * is also started and stopped via the perf_event_open() system call, use
+ * the same event enable/disable callback functions. They do not
+ * have a pointer to the perf_event structure as first parameter.
+ *
+ * The functions XXX_add, XXX_del, XXX_start and XXX_stop are also common.
+ * Reuse them and distinguish the event (always the first parameter) via
+ * the 'config' member.
+ */
+static struct pmu cf_diag = {
+	.task_ctx_nr  = perf_sw_context,
+	.event_init   = cfdiag_event_init,
+	.pmu_enable   = cpumf_pmu_enable,
+	.pmu_disable  = cpumf_pmu_disable,
+	.add	      = cpumf_pmu_add,
+	.del	      = cpumf_pmu_del,
+	.start	      = cpumf_pmu_start,
+	.stop	      = cpumf_pmu_stop,
+	.read	      = cfdiag_read,
+
+	.attr_groups  = cfdiag_attr_groups
+};
+
+/* Calculate memory needed to store all counter sets together with header and
+ * trailer data. This is independent of the counter set authorization which
+ * can vary depending on the configuration.
+ */
+static size_t cfdiag_maxsize(struct cpumf_ctr_info *info)
+{
+	size_t max_size = sizeof(struct cf_trailer_entry);
+	enum cpumf_ctr_set i;
+
+	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+		size_t size = cpum_cf_ctrset_size(i, info);
+
+		if (size)
+			max_size += size * sizeof(u64) +
+				    sizeof(struct cf_ctrset_entry);
+	}
+	return max_size;
+}
+
+/* Get the CPU speed, try sampling facility first and CPU attributes second. */
+static void cfdiag_get_cpu_speed(void)
+{
+	if (cpum_sf_avail()) {			/* Sampling facility first */
+		struct hws_qsi_info_block si;
+
+		memset(&si, 0, sizeof(si));
+		if (!qsi(&si)) {
+			cfdiag_cpu_speed = si.cpu_speed;
+			return;
+		}
+	}
+
+	/* Fallback: extract the static part of the CPU speed. Used in case
+	 * the CPU Measurement Sampling Facility is turned off.
+	 */
+	if (test_facility(34)) {
+		unsigned long mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0);
+
+		if (mhz != -1UL)
+			cfdiag_cpu_speed = mhz & 0xffffffff;
+	}
+}
+
+static int cfset_init(void)
+{
+	struct cpumf_ctr_info info;
+	size_t need;
+	int rc;
+
+	if (qctri(&info))
+		return -ENODEV;
+
+	cfdiag_get_cpu_speed();
+	/* Make sure the counter set data fits into predefined buffer. */
+	need = cfdiag_maxsize(&info);
+	if (need > sizeof(((struct cpu_cf_events *)0)->start)) {
+		pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n",
+		       need);
+		return -ENOMEM;
+	}
+
+	rc = misc_register(&cfset_dev);
+	if (rc) {
+		pr_err("Registration of /dev/%s failed rc=%i\n",
+		       cfset_dev.name, rc);
+		goto out;
+	}
+
+	rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1);
+	if (rc) {
+		misc_deregister(&cfset_dev);
+		pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n",
+		       rc);
+	}
+out:
+	return rc;
+}
+
+device_initcall(cpumf_pmu_init);
diff --git a/arch/s390/kernel/perf_cpum_cf_common.c b/arch/s390/kernel/perf_cpum_cf_common.c
index 2300fba..30f0242d 100644
--- a/arch/s390/kernel/perf_cpum_cf_common.c
+++ b/arch/s390/kernel/perf_cpum_cf_common.c
@@ -29,7 +29,11 @@
 	},
 	.alert = ATOMIC64_INIT(0),
 	.state = 0,
+	.dev_state = 0,
 	.flags = 0,
+	.used = 0,
+	.usedss = 0,
+	.sets = 0
 };
 /* Indicator whether the CPU-Measurement Counter Facility Support is ready */
 static bool cpum_cf_initalized;
@@ -96,25 +100,10 @@
 }
 EXPORT_SYMBOL(kernel_cpumcf_avail);
 
-
-/* Reserve/release functions for sharing perf hardware */
-static DEFINE_SPINLOCK(cpumcf_owner_lock);
-static void *cpumcf_owner;
-
 /* Initialize the CPU-measurement counter facility */
 int __kernel_cpumcf_begin(void)
 {
 	int flags = PMC_INIT;
-	int err = 0;
-
-	spin_lock(&cpumcf_owner_lock);
-	if (cpumcf_owner)
-		err = -EBUSY;
-	else
-		cpumcf_owner = __builtin_return_address(0);
-	spin_unlock(&cpumcf_owner_lock);
-	if (err)
-		return err;
 
 	on_each_cpu(cpum_cf_setup_cpu, &flags, 1);
 	irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT);
@@ -144,10 +133,6 @@
 
 	on_each_cpu(cpum_cf_setup_cpu, &flags, 1);
 	irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT);
-
-	spin_lock(&cpumcf_owner_lock);
-	cpumcf_owner = NULL;
-	spin_unlock(&cpumcf_owner_lock);
 }
 EXPORT_SYMBOL(__kernel_cpumcf_end);
 
@@ -161,11 +146,13 @@
 
 static int cpum_cf_online_cpu(unsigned int cpu)
 {
-	return cpum_cf_setup(cpu, PMC_INIT);
+	cpum_cf_setup(cpu, PMC_INIT);
+	return cfset_online_cpu(cpu);
 }
 
 static int cpum_cf_offline_cpu(unsigned int cpu)
 {
+	cfset_offline_cpu(cpu);
 	return cpum_cf_setup(cpu, PMC_RELEASE);
 }
 
diff --git a/arch/s390/kernel/perf_cpum_cf_diag.c b/arch/s390/kernel/perf_cpum_cf_diag.c
deleted file mode 100644
index 08c985c..0000000
--- a/arch/s390/kernel/perf_cpum_cf_diag.c
+++ /dev/null
@@ -1,1148 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Performance event support for s390x - CPU-measurement Counter Sets
- *
- *  Copyright IBM Corp. 2019, 2021
- *  Author(s): Hendrik Brueckner <brueckner@linux.ibm.com>
- *	       Thomas Richer <tmricht@linux.ibm.com>
- */
-#define KMSG_COMPONENT	"cpum_cf_diag"
-#define pr_fmt(fmt)	KMSG_COMPONENT ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/kernel_stat.h>
-#include <linux/percpu.h>
-#include <linux/notifier.h>
-#include <linux/init.h>
-#include <linux/export.h>
-#include <linux/slab.h>
-#include <linux/processor.h>
-#include <linux/miscdevice.h>
-#include <linux/mutex.h>
-
-#include <asm/ctl_reg.h>
-#include <asm/irq.h>
-#include <asm/cpu_mcf.h>
-#include <asm/timex.h>
-#include <asm/debug.h>
-
-#include <asm/hwctrset.h>
-
-#define	CF_DIAG_CTRSET_DEF		0xfeef	/* Counter set header mark */
-						/* interval in seconds */
-static unsigned int cf_diag_cpu_speed;
-static debug_info_t *cf_diag_dbg;
-
-struct cf_diag_csd {			/* Counter set data per CPU */
-	size_t used;			/* Bytes used in data/start */
-	unsigned char start[PAGE_SIZE];	/* Counter set at event start */
-	unsigned char data[PAGE_SIZE];	/* Counter set at event delete */
-	unsigned int sets;		/* # Counter set saved in data */
-};
-static DEFINE_PER_CPU(struct cf_diag_csd, cf_diag_csd);
-
-/* Counter sets are stored as data stream in a page sized memory buffer and
- * exported to user space via raw data attached to the event sample data.
- * Each counter set starts with an eight byte header consisting of:
- * - a two byte eye catcher (0xfeef)
- * - a one byte counter set number
- * - a two byte counter set size (indicates the number of counters in this set)
- * - a three byte reserved value (must be zero) to make the header the same
- *   size as a counter value.
- * All counter values are eight byte in size.
- *
- * All counter sets are followed by a 64 byte trailer.
- * The trailer consists of a:
- * - flag field indicating valid fields when corresponding bit set
- * - the counter facility first and second version number
- * - the CPU speed if nonzero
- * - the time stamp the counter sets have been collected
- * - the time of day (TOD) base value
- * - the machine type.
- *
- * The counter sets are saved when the process is prepared to be executed on a
- * CPU and saved again when the process is going to be removed from a CPU.
- * The difference of both counter sets are calculated and stored in the event
- * sample data area.
- */
-
-struct cf_ctrset_entry {	/* CPU-M CF counter set entry (8 byte) */
-	unsigned int def:16;	/* 0-15  Data Entry Format */
-	unsigned int set:16;	/* 16-31 Counter set identifier */
-	unsigned int ctr:16;	/* 32-47 Number of stored counters */
-	unsigned int res1:16;	/* 48-63 Reserved */
-};
-
-struct cf_trailer_entry {	/* CPU-M CF_DIAG trailer (64 byte) */
-	/* 0 - 7 */
-	union {
-		struct {
-			unsigned int clock_base:1;	/* TOD clock base set */
-			unsigned int speed:1;		/* CPU speed set */
-			/* Measurement alerts */
-			unsigned int mtda:1;	/* Loss of MT ctr. data alert */
-			unsigned int caca:1;	/* Counter auth. change alert */
-			unsigned int lcda:1;	/* Loss of counter data alert */
-		};
-		unsigned long flags;	/* 0-63    All indicators */
-	};
-	/* 8 - 15 */
-	unsigned int cfvn:16;			/* 64-79   Ctr First Version */
-	unsigned int csvn:16;			/* 80-95   Ctr Second Version */
-	unsigned int cpu_speed:32;		/* 96-127  CPU speed */
-	/* 16 - 23 */
-	unsigned long timestamp;		/* 128-191 Timestamp (TOD) */
-	/* 24 - 55 */
-	union {
-		struct {
-			unsigned long progusage1;
-			unsigned long progusage2;
-			unsigned long progusage3;
-			unsigned long tod_base;
-		};
-		unsigned long progusage[4];
-	};
-	/* 56 - 63 */
-	unsigned int mach_type:16;		/* Machine type */
-	unsigned int res1:16;			/* Reserved */
-	unsigned int res2:32;			/* Reserved */
-};
-
-/* Create the trailer data at the end of a page. */
-static void cf_diag_trailer(struct cf_trailer_entry *te)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-	struct cpuid cpuid;
-
-	te->cfvn = cpuhw->info.cfvn;		/* Counter version numbers */
-	te->csvn = cpuhw->info.csvn;
-
-	get_cpu_id(&cpuid);			/* Machine type */
-	te->mach_type = cpuid.machine;
-	te->cpu_speed = cf_diag_cpu_speed;
-	if (te->cpu_speed)
-		te->speed = 1;
-	te->clock_base = 1;			/* Save clock base */
-	te->tod_base = tod_clock_base.tod;
-	te->timestamp = get_tod_clock_fast();
-}
-
-/*
- * Change the CPUMF state to active.
- * Enable and activate the CPU-counter sets according
- * to the per-cpu control state.
- */
-static void cf_diag_enable(struct pmu *pmu)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-	int err;
-
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s pmu %p cpu %d flags %#x state %#llx\n",
-			    __func__, pmu, smp_processor_id(), cpuhw->flags,
-			    cpuhw->state);
-	if (cpuhw->flags & PMU_F_ENABLED)
-		return;
-
-	err = lcctl(cpuhw->state);
-	if (err) {
-		pr_err("Enabling the performance measuring unit "
-		       "failed with rc=%x\n", err);
-		return;
-	}
-	cpuhw->flags |= PMU_F_ENABLED;
-}
-
-/*
- * Change the CPUMF state to inactive.
- * Disable and enable (inactive) the CPU-counter sets according
- * to the per-cpu control state.
- */
-static void cf_diag_disable(struct pmu *pmu)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-	u64 inactive;
-	int err;
-
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s pmu %p cpu %d flags %#x state %#llx\n",
-			    __func__, pmu, smp_processor_id(), cpuhw->flags,
-			    cpuhw->state);
-	if (!(cpuhw->flags & PMU_F_ENABLED))
-		return;
-
-	inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1);
-	err = lcctl(inactive);
-	if (err) {
-		pr_err("Disabling the performance measuring unit "
-		       "failed with rc=%x\n", err);
-		return;
-	}
-	cpuhw->flags &= ~PMU_F_ENABLED;
-}
-
-/* Number of perf events counting hardware events */
-static atomic_t cf_diag_events = ATOMIC_INIT(0);
-/* Used to avoid races in calling reserve/release_cpumf_hardware */
-static DEFINE_MUTEX(cf_diag_reserve_mutex);
-
-/* Release the PMU if event is the last perf event */
-static void cf_diag_perf_event_destroy(struct perf_event *event)
-{
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s event %p cpu %d cf_diag_events %d\n",
-			    __func__, event, smp_processor_id(),
-			    atomic_read(&cf_diag_events));
-	if (atomic_dec_return(&cf_diag_events) == 0)
-		__kernel_cpumcf_end();
-}
-
-static int get_authctrsets(void)
-{
-	struct cpu_cf_events *cpuhw;
-	unsigned long auth = 0;
-	enum cpumf_ctr_set i;
-
-	cpuhw = &get_cpu_var(cpu_cf_events);
-	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
-		if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i])
-			auth |= cpumf_ctr_ctl[i];
-	}
-	put_cpu_var(cpu_cf_events);
-	return auth;
-}
-
-/* Setup the event. Test for authorized counter sets and only include counter
- * sets which are authorized at the time of the setup. Including unauthorized
- * counter sets result in specification exception (and panic).
- */
-static int __hw_perf_event_init(struct perf_event *event)
-{
-	struct perf_event_attr *attr = &event->attr;
-	int err = 0;
-
-	debug_sprintf_event(cf_diag_dbg, 5, "%s event %p cpu %d\n", __func__,
-			    event, event->cpu);
-
-	event->hw.config = attr->config;
-
-	/* Add all authorized counter sets to config_base. The
-	 * the hardware init function is either called per-cpu or just once
-	 * for all CPUS (event->cpu == -1).  This depends on the whether
-	 * counting is started for all CPUs or on a per workload base where
-	 * the perf event moves from one CPU to another CPU.
-	 * Checking the authorization on any CPU is fine as the hardware
-	 * applies the same authorization settings to all CPUs.
-	 */
-	event->hw.config_base = get_authctrsets();
-
-	/* No authorized counter sets, nothing to count/sample */
-	if (!event->hw.config_base) {
-		err = -EINVAL;
-		goto out;
-	}
-
-	/* Set sample_period to indicate sampling */
-	event->hw.sample_period = attr->sample_period;
-	local64_set(&event->hw.period_left, event->hw.sample_period);
-	event->hw.last_period  = event->hw.sample_period;
-out:
-	debug_sprintf_event(cf_diag_dbg, 5, "%s err %d config_base %#lx\n",
-			    __func__, err, event->hw.config_base);
-	return err;
-}
-
-/* Return 0 if the CPU-measurement counter facility is currently free
- * and an error otherwise.
- */
-static int cf_diag_perf_event_inuse(void)
-{
-	int err = 0;
-
-	if (!atomic_inc_not_zero(&cf_diag_events)) {
-		mutex_lock(&cf_diag_reserve_mutex);
-		if (atomic_read(&cf_diag_events) == 0 &&
-		    __kernel_cpumcf_begin())
-			err = -EBUSY;
-		else
-			err = atomic_inc_return(&cf_diag_events);
-		mutex_unlock(&cf_diag_reserve_mutex);
-	}
-	return err;
-}
-
-static int cf_diag_event_init(struct perf_event *event)
-{
-	struct perf_event_attr *attr = &event->attr;
-	int err = -ENOENT;
-
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s event %p cpu %d config %#llx type:%u "
-			    "sample_type %#llx cf_diag_events %d\n", __func__,
-			    event, event->cpu, attr->config, event->pmu->type,
-			    attr->sample_type, atomic_read(&cf_diag_events));
-
-	if (event->attr.config != PERF_EVENT_CPUM_CF_DIAG ||
-	    event->attr.type != event->pmu->type)
-		goto out;
-
-	/* Raw events are used to access counters directly,
-	 * hence do not permit excludes.
-	 * This event is usesless without PERF_SAMPLE_RAW to return counter set
-	 * values as raw data.
-	 */
-	if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv ||
-	    !(attr->sample_type & (PERF_SAMPLE_CPU | PERF_SAMPLE_RAW))) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-
-	/* Initialize for using the CPU-measurement counter facility */
-	err = cf_diag_perf_event_inuse();
-	if (err < 0)
-		goto out;
-	event->destroy = cf_diag_perf_event_destroy;
-
-	err = __hw_perf_event_init(event);
-	if (unlikely(err))
-		event->destroy(event);
-out:
-	debug_sprintf_event(cf_diag_dbg, 5, "%s err %d\n", __func__, err);
-	return err;
-}
-
-static void cf_diag_read(struct perf_event *event)
-{
-	debug_sprintf_event(cf_diag_dbg, 5, "%s event %p\n", __func__, event);
-}
-
-/* Calculate memory needed to store all counter sets together with header and
- * trailer data. This is independend of the counter set authorization which
- * can vary depending on the configuration.
- */
-static size_t cf_diag_ctrset_maxsize(struct cpumf_ctr_info *info)
-{
-	size_t max_size = sizeof(struct cf_trailer_entry);
-	enum cpumf_ctr_set i;
-
-	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
-		size_t size = cpum_cf_ctrset_size(i, info);
-
-		if (size)
-			max_size += size * sizeof(u64) +
-				    sizeof(struct cf_ctrset_entry);
-	}
-	debug_sprintf_event(cf_diag_dbg, 5, "%s max_size %zu\n", __func__,
-			    max_size);
-
-	return max_size;
-}
-
-/* Read a counter set. The counter set number determines which counter set and
- * the CPUM-CF first and second version number determine the number of
- * available counters in this counter set.
- * Each counter set starts with header containing the counter set number and
- * the number of 8 byte counters.
- *
- * The functions returns the number of bytes occupied by this counter set
- * including the header.
- * If there is no counter in the counter set, this counter set is useless and
- * zero is returned on this case.
- */
-static size_t cf_diag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset,
-				size_t room)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-	size_t ctrset_size, need = 0;
-	int rc = 3;				/* Assume write failure */
-
-	ctrdata->def = CF_DIAG_CTRSET_DEF;
-	ctrdata->set = ctrset;
-	ctrdata->res1 = 0;
-	ctrset_size = cpum_cf_ctrset_size(ctrset, &cpuhw->info);
-
-	if (ctrset_size) {			/* Save data */
-		need = ctrset_size * sizeof(u64) + sizeof(*ctrdata);
-		if (need <= room)
-			rc = ctr_stcctm(ctrset, ctrset_size,
-					(u64 *)(ctrdata + 1));
-		if (rc != 3)
-			ctrdata->ctr = ctrset_size;
-		else
-			need = 0;
-	}
-
-	debug_sprintf_event(cf_diag_dbg, 6,
-			    "%s ctrset %d ctrset_size %zu cfvn %d csvn %d"
-			    " need %zd rc %d\n",
-			    __func__, ctrset, ctrset_size, cpuhw->info.cfvn,
-			    cpuhw->info.csvn, need, rc);
-	return need;
-}
-
-/* Read out all counter sets and save them in the provided data buffer.
- * The last 64 byte host an artificial trailer entry.
- */
-static size_t cf_diag_getctr(void *data, size_t sz, unsigned long auth)
-{
-	struct cf_trailer_entry *trailer;
-	size_t offset = 0, done;
-	int i;
-
-	memset(data, 0, sz);
-	sz -= sizeof(*trailer);			/* Always room for trailer */
-	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
-		struct cf_ctrset_entry *ctrdata = data + offset;
-
-		if (!(auth & cpumf_ctr_ctl[i]))
-			continue;	/* Counter set not authorized */
-
-		done = cf_diag_getctrset(ctrdata, i, sz - offset);
-		offset += done;
-		debug_sprintf_event(cf_diag_dbg, 6,
-				    "%s ctrset %d offset %zu done %zu\n",
-				     __func__, i, offset, done);
-	}
-	trailer = data + offset;
-	cf_diag_trailer(trailer);
-	return offset + sizeof(*trailer);
-}
-
-/* Calculate the difference for each counter in a counter set. */
-static void cf_diag_diffctrset(u64 *pstart, u64 *pstop, int counters)
-{
-	for (; --counters >= 0; ++pstart, ++pstop)
-		if (*pstop >= *pstart)
-			*pstop -= *pstart;
-		else
-			*pstop = *pstart - *pstop;
-}
-
-/* Scan the counter sets and calculate the difference of each counter
- * in each set. The result is the increment of each counter during the
- * period the counter set has been activated.
- *
- * Return true on success.
- */
-static int cf_diag_diffctr(struct cf_diag_csd *csd, unsigned long auth)
-{
-	struct cf_trailer_entry *trailer_start, *trailer_stop;
-	struct cf_ctrset_entry *ctrstart, *ctrstop;
-	size_t offset = 0;
-
-	auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1;
-	do {
-		ctrstart = (struct cf_ctrset_entry *)(csd->start + offset);
-		ctrstop = (struct cf_ctrset_entry *)(csd->data + offset);
-
-		if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) {
-			pr_err("cpum_cf_diag counter set compare error "
-				"in set %i\n", ctrstart->set);
-			return 0;
-		}
-		auth &= ~cpumf_ctr_ctl[ctrstart->set];
-		if (ctrstart->def == CF_DIAG_CTRSET_DEF) {
-			cf_diag_diffctrset((u64 *)(ctrstart + 1),
-					  (u64 *)(ctrstop + 1), ctrstart->ctr);
-			offset += ctrstart->ctr * sizeof(u64) +
-				  sizeof(*ctrstart);
-		}
-		debug_sprintf_event(cf_diag_dbg, 6,
-				    "%s set %d ctr %d offset %zu auth %lx\n",
-				    __func__, ctrstart->set, ctrstart->ctr,
-				    offset, auth);
-	} while (ctrstart->def && auth);
-
-	/* Save time_stamp from start of event in stop's trailer */
-	trailer_start = (struct cf_trailer_entry *)(csd->start + offset);
-	trailer_stop = (struct cf_trailer_entry *)(csd->data + offset);
-	trailer_stop->progusage[0] = trailer_start->timestamp;
-
-	return 1;
-}
-
-/* Create perf event sample with the counter sets as raw data.	The sample
- * is then pushed to the event subsystem and the function checks for
- * possible event overflows. If an event overflow occurs, the PMU is
- * stopped.
- *
- * Return non-zero if an event overflow occurred.
- */
-static int cf_diag_push_sample(struct perf_event *event,
-			       struct cf_diag_csd *csd)
-{
-	struct perf_sample_data data;
-	struct perf_raw_record raw;
-	struct pt_regs regs;
-	int overflow;
-
-	/* Setup perf sample */
-	perf_sample_data_init(&data, 0, event->hw.last_period);
-	memset(&regs, 0, sizeof(regs));
-	memset(&raw, 0, sizeof(raw));
-
-	if (event->attr.sample_type & PERF_SAMPLE_CPU)
-		data.cpu_entry.cpu = event->cpu;
-	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
-		raw.frag.size = csd->used;
-		raw.frag.data = csd->data;
-		raw.size = csd->used;
-		data.raw = &raw;
-	}
-
-	overflow = perf_event_overflow(event, &data, &regs);
-	debug_sprintf_event(cf_diag_dbg, 6,
-			    "%s event %p cpu %d sample_type %#llx raw %d "
-			    "ov %d\n", __func__, event, event->cpu,
-			    event->attr.sample_type, raw.size, overflow);
-	if (overflow)
-		event->pmu->stop(event, 0);
-
-	perf_event_update_userpage(event);
-	return overflow;
-}
-
-static void cf_diag_start(struct perf_event *event, int flags)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-	struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd);
-	struct hw_perf_event *hwc = &event->hw;
-
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s event %p cpu %d flags %#x hwc-state %#x\n",
-			    __func__, event, event->cpu, flags, hwc->state);
-	if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
-		return;
-
-	/* (Re-)enable and activate all counter sets */
-	lcctl(0);		/* Reset counter sets */
-	hwc->state = 0;
-	ctr_set_multiple_enable(&cpuhw->state, hwc->config_base);
-	lcctl(cpuhw->state);	/* Enable counter sets */
-	csd->used = cf_diag_getctr(csd->start, sizeof(csd->start),
-				   event->hw.config_base);
-	ctr_set_multiple_start(&cpuhw->state, hwc->config_base);
-	/* Function cf_diag_enable() starts the counter sets. */
-}
-
-static void cf_diag_stop(struct perf_event *event, int flags)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-	struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd);
-	struct hw_perf_event *hwc = &event->hw;
-
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s event %p cpu %d flags %#x hwc-state %#x\n",
-			    __func__, event, event->cpu, flags, hwc->state);
-
-	/* Deactivate all counter sets */
-	ctr_set_multiple_stop(&cpuhw->state, hwc->config_base);
-	local64_inc(&event->count);
-	csd->used = cf_diag_getctr(csd->data, sizeof(csd->data),
-				   event->hw.config_base);
-	if (cf_diag_diffctr(csd, event->hw.config_base))
-		cf_diag_push_sample(event, csd);
-	hwc->state |= PERF_HES_STOPPED;
-}
-
-static int cf_diag_add(struct perf_event *event, int flags)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-	int err = 0;
-
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s event %p cpu %d flags %#x cpuhw %p\n",
-			    __func__, event, event->cpu, flags, cpuhw);
-
-	if (cpuhw->flags & PMU_F_IN_USE) {
-		err = -EAGAIN;
-		goto out;
-	}
-
-	event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-
-	cpuhw->flags |= PMU_F_IN_USE;
-	if (flags & PERF_EF_START)
-		cf_diag_start(event, PERF_EF_RELOAD);
-out:
-	debug_sprintf_event(cf_diag_dbg, 5, "%s err %d\n", __func__, err);
-	return err;
-}
-
-static void cf_diag_del(struct perf_event *event, int flags)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s event %p cpu %d flags %#x\n",
-			   __func__, event, event->cpu, flags);
-
-	cf_diag_stop(event, PERF_EF_UPDATE);
-	ctr_set_multiple_stop(&cpuhw->state, event->hw.config_base);
-	ctr_set_multiple_disable(&cpuhw->state, event->hw.config_base);
-	cpuhw->flags &= ~PMU_F_IN_USE;
-}
-
-/* Default counter set events and format attribute groups */
-
-CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG);
-
-static struct attribute *cf_diag_events_attr[] = {
-	CPUMF_EVENT_PTR(CF_DIAG, CF_DIAG),
-	NULL,
-};
-
-PMU_FORMAT_ATTR(event, "config:0-63");
-
-static struct attribute *cf_diag_format_attr[] = {
-	&format_attr_event.attr,
-	NULL,
-};
-
-static struct attribute_group cf_diag_events_group = {
-	.name = "events",
-	.attrs = cf_diag_events_attr,
-};
-static struct attribute_group cf_diag_format_group = {
-	.name = "format",
-	.attrs = cf_diag_format_attr,
-};
-static const struct attribute_group *cf_diag_attr_groups[] = {
-	&cf_diag_events_group,
-	&cf_diag_format_group,
-	NULL,
-};
-
-/* Performance monitoring unit for s390x */
-static struct pmu cf_diag = {
-	.task_ctx_nr  = perf_sw_context,
-	.pmu_enable   = cf_diag_enable,
-	.pmu_disable  = cf_diag_disable,
-	.event_init   = cf_diag_event_init,
-	.add	      = cf_diag_add,
-	.del	      = cf_diag_del,
-	.start	      = cf_diag_start,
-	.stop	      = cf_diag_stop,
-	.read	      = cf_diag_read,
-
-	.attr_groups  = cf_diag_attr_groups
-};
-
-/* Get the CPU speed, try sampling facility first and CPU attributes second. */
-static void cf_diag_get_cpu_speed(void)
-{
-	if (cpum_sf_avail()) {			/* Sampling facility first */
-		struct hws_qsi_info_block si;
-
-		memset(&si, 0, sizeof(si));
-		if (!qsi(&si)) {
-			cf_diag_cpu_speed = si.cpu_speed;
-			return;
-		}
-	}
-
-	if (test_facility(34)) {		/* CPU speed extract static part */
-		unsigned long mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0);
-
-		if (mhz != -1UL)
-			cf_diag_cpu_speed = mhz & 0xffffffff;
-	}
-}
-
-/* Code to create device and file I/O operations */
-static atomic_t ctrset_opencnt = ATOMIC_INIT(0);	/* Excl. access */
-
-static int cf_diag_open(struct inode *inode, struct file *file)
-{
-	int err = 0;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-	if (atomic_xchg(&ctrset_opencnt, 1))
-		return -EBUSY;
-
-	/* Avoid concurrent access with perf_event_open() system call */
-	mutex_lock(&cf_diag_reserve_mutex);
-	if (atomic_read(&cf_diag_events) || __kernel_cpumcf_begin())
-		err = -EBUSY;
-	mutex_unlock(&cf_diag_reserve_mutex);
-	if (err) {
-		atomic_set(&ctrset_opencnt, 0);
-		return err;
-	}
-	file->private_data = NULL;
-	debug_sprintf_event(cf_diag_dbg, 2, "%s\n", __func__);
-	/* nonseekable_open() never fails */
-	return nonseekable_open(inode, file);
-}
-
-/* Variables for ioctl() interface support */
-static DEFINE_MUTEX(cf_diag_ctrset_mutex);
-static struct cf_diag_ctrset {
-	unsigned long ctrset;		/* Bit mask of counter set to read */
-	cpumask_t mask;			/* CPU mask to read from */
-} cf_diag_ctrset;
-
-static void cf_diag_ctrset_clear(void)
-{
-	cpumask_clear(&cf_diag_ctrset.mask);
-	cf_diag_ctrset.ctrset = 0;
-}
-
-static void cf_diag_release_cpu(void *p)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-
-	debug_sprintf_event(cf_diag_dbg, 3, "%s cpu %d\n", __func__,
-			    smp_processor_id());
-	lcctl(0);		/* Reset counter sets */
-	cpuhw->state = 0;	/* Save state in CPU hardware state */
-}
-
-/* Release function is also called when application gets terminated without
- * doing a proper ioctl(..., S390_HWCTR_STOP, ...) command.
- * Since only one application is allowed to open the device, simple stop all
- * CPU counter sets.
- */
-static int cf_diag_release(struct inode *inode, struct file *file)
-{
-	on_each_cpu(cf_diag_release_cpu, NULL, 1);
-	cf_diag_ctrset_clear();
-	atomic_set(&ctrset_opencnt, 0);
-	__kernel_cpumcf_end();
-	debug_sprintf_event(cf_diag_dbg, 2, "%s\n", __func__);
-	return 0;
-}
-
-struct cf_diag_call_on_cpu_parm {	/* Parm struct for smp_call_on_cpu */
-	unsigned int sets;		/* Counter set bit mask */
-	atomic_t cpus_ack;		/* # CPUs successfully executed func */
-};
-
-static int cf_diag_all_copy(unsigned long arg, cpumask_t *mask)
-{
-	struct s390_ctrset_read __user *ctrset_read;
-	unsigned int cpu, cpus, rc;
-	void __user *uptr;
-
-	ctrset_read = (struct s390_ctrset_read __user *)arg;
-	uptr = ctrset_read->data;
-	for_each_cpu(cpu, mask) {
-		struct cf_diag_csd *csd = per_cpu_ptr(&cf_diag_csd, cpu);
-		struct s390_ctrset_cpudata __user *ctrset_cpudata;
-
-		ctrset_cpudata = uptr;
-		debug_sprintf_event(cf_diag_dbg, 5, "%s cpu %d used %zd\n",
-				    __func__, cpu, csd->used);
-		rc  = put_user(cpu, &ctrset_cpudata->cpu_nr);
-		rc |= put_user(csd->sets, &ctrset_cpudata->no_sets);
-		rc |= copy_to_user(ctrset_cpudata->data, csd->data, csd->used);
-		if (rc)
-			return -EFAULT;
-		uptr += sizeof(struct s390_ctrset_cpudata) + csd->used;
-		cond_resched();
-	}
-	cpus = cpumask_weight(mask);
-	if (put_user(cpus, &ctrset_read->no_cpus))
-		return -EFAULT;
-	debug_sprintf_event(cf_diag_dbg, 5, "%s copied %ld\n",
-			    __func__, uptr - (void __user *)ctrset_read->data);
-	return 0;
-}
-
-static size_t cf_diag_cpuset_read(struct s390_ctrset_setdata *p, int ctrset,
-				  int ctrset_size, size_t room)
-{
-	size_t need = 0;
-	int rc = -1;
-
-	need = sizeof(*p) + sizeof(u64) * ctrset_size;
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s room %zd need %zd set %#x set_size %d\n",
-			    __func__, room, need, ctrset, ctrset_size);
-	if (need <= room) {
-		p->set = cpumf_ctr_ctl[ctrset];
-		p->no_cnts = ctrset_size;
-		rc = ctr_stcctm(ctrset, ctrset_size, (u64 *)p->cv);
-		if (rc == 3)		/* Nothing stored */
-			need = 0;
-	}
-	debug_sprintf_event(cf_diag_dbg, 5, "%s need %zd rc %d\n", __func__,
-			    need, rc);
-	return need;
-}
-
-/* Read all counter sets. Since the perf_event_open() system call with
- * event cpum_cf_diag/.../ is blocked when this interface is active, reuse
- * the perf_event_open() data buffer to store the counter sets.
- */
-static void cf_diag_cpu_read(void *parm)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-	struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd);
-	struct cf_diag_call_on_cpu_parm *p = parm;
-	int set, set_size;
-	size_t space;
-
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s new %#x flags %#x state %#llx\n",
-			    __func__, p->sets, cpuhw->flags,
-			    cpuhw->state);
-	/* No data saved yet */
-	csd->used = 0;
-	csd->sets = 0;
-	memset(csd->data, 0, sizeof(csd->data));
-
-	/* Scan the counter sets */
-	for (set = CPUMF_CTR_SET_BASIC; set < CPUMF_CTR_SET_MAX; ++set) {
-		struct s390_ctrset_setdata *sp = (void *)csd->data + csd->used;
-
-		if (!(p->sets & cpumf_ctr_ctl[set]))
-			continue;	/* Counter set not in list */
-		set_size = cpum_cf_ctrset_size(set, &cpuhw->info);
-		space = sizeof(csd->data) - csd->used;
-		space = cf_diag_cpuset_read(sp, set, set_size, space);
-		if (space) {
-			csd->used += space;
-			csd->sets += 1;
-		}
-		debug_sprintf_event(cf_diag_dbg, 5, "%s sp %px space %zd\n",
-				    __func__, sp, space);
-	}
-	debug_sprintf_event(cf_diag_dbg, 5, "%s sets %d used %zd\n", __func__,
-			    csd->sets, csd->used);
-}
-
-static int cf_diag_all_read(unsigned long arg)
-{
-	struct cf_diag_call_on_cpu_parm p;
-	cpumask_var_t mask;
-	int rc;
-
-	debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__);
-	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
-		return -ENOMEM;
-
-	p.sets = cf_diag_ctrset.ctrset;
-	cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask);
-	on_each_cpu_mask(mask, cf_diag_cpu_read, &p, 1);
-	rc = cf_diag_all_copy(arg, mask);
-	free_cpumask_var(mask);
-	debug_sprintf_event(cf_diag_dbg, 5, "%s rc %d\n", __func__, rc);
-	return rc;
-}
-
-/* Stop all counter sets via ioctl interface */
-static void cf_diag_ioctl_off(void *parm)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-	struct cf_diag_call_on_cpu_parm *p = parm;
-	int rc;
-
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s new %#x flags %#x state %#llx\n",
-			    __func__, p->sets, cpuhw->flags,
-			    cpuhw->state);
-
-	ctr_set_multiple_disable(&cpuhw->state, p->sets);
-	ctr_set_multiple_stop(&cpuhw->state, p->sets);
-	rc = lcctl(cpuhw->state);		/* Stop counter sets */
-	if (!cpuhw->state)
-		cpuhw->flags &= ~PMU_F_IN_USE;
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s rc %d flags %#x state %#llx\n", __func__,
-			     rc, cpuhw->flags, cpuhw->state);
-}
-
-/* Start counter sets on particular CPU */
-static void cf_diag_ioctl_on(void *parm)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-	struct cf_diag_call_on_cpu_parm *p = parm;
-	int rc;
-
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s new %#x flags %#x state %#llx\n",
-			    __func__, p->sets, cpuhw->flags,
-			    cpuhw->state);
-
-	if (!(cpuhw->flags & PMU_F_IN_USE))
-		cpuhw->state = 0;
-	cpuhw->flags |= PMU_F_IN_USE;
-	rc = lcctl(cpuhw->state);		/* Reset unused counter sets */
-	ctr_set_multiple_enable(&cpuhw->state, p->sets);
-	ctr_set_multiple_start(&cpuhw->state, p->sets);
-	rc |= lcctl(cpuhw->state);		/* Start counter sets */
-	if (!rc)
-		atomic_inc(&p->cpus_ack);
-	debug_sprintf_event(cf_diag_dbg, 5, "%s rc %d state %#llx\n",
-			    __func__, rc, cpuhw->state);
-}
-
-static int cf_diag_all_stop(void)
-{
-	struct cf_diag_call_on_cpu_parm p = {
-		.sets = cf_diag_ctrset.ctrset,
-	};
-	cpumask_var_t mask;
-
-	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
-		return -ENOMEM;
-	cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask);
-	on_each_cpu_mask(mask, cf_diag_ioctl_off, &p, 1);
-	free_cpumask_var(mask);
-	return 0;
-}
-
-static int cf_diag_all_start(void)
-{
-	struct cf_diag_call_on_cpu_parm p = {
-		.sets = cf_diag_ctrset.ctrset,
-		.cpus_ack = ATOMIC_INIT(0),
-	};
-	cpumask_var_t mask;
-	int rc = 0;
-
-	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
-		return -ENOMEM;
-	cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask);
-	on_each_cpu_mask(mask, cf_diag_ioctl_on, &p, 1);
-	if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) {
-		on_each_cpu_mask(mask, cf_diag_ioctl_off, &p, 1);
-		rc = -EIO;
-	}
-	free_cpumask_var(mask);
-	return rc;
-}
-
-/* Return the maximum required space for all possible CPUs in case one
- * CPU will be onlined during the START, READ, STOP cycles.
- * To find out the size of the counter sets, any one CPU will do. They
- * all have the same counter sets.
- */
-static size_t cf_diag_needspace(unsigned int sets)
-{
-	struct cpu_cf_events *cpuhw = get_cpu_ptr(&cpu_cf_events);
-	size_t bytes = 0;
-	int i;
-
-	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
-		if (!(sets & cpumf_ctr_ctl[i]))
-			continue;
-		bytes += cpum_cf_ctrset_size(i, &cpuhw->info) * sizeof(u64) +
-			 sizeof(((struct s390_ctrset_setdata *)0)->set) +
-			 sizeof(((struct s390_ctrset_setdata *)0)->no_cnts);
-	}
-	bytes = sizeof(((struct s390_ctrset_read *)0)->no_cpus) + nr_cpu_ids *
-		(bytes + sizeof(((struct s390_ctrset_cpudata *)0)->cpu_nr) +
-		     sizeof(((struct s390_ctrset_cpudata *)0)->no_sets));
-	debug_sprintf_event(cf_diag_dbg, 5, "%s bytes %ld\n", __func__,
-			    bytes);
-	put_cpu_ptr(&cpu_cf_events);
-	return bytes;
-}
-
-static long cf_diag_ioctl_read(unsigned long arg)
-{
-	struct s390_ctrset_read read;
-	int ret = 0;
-
-	debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__);
-	if (copy_from_user(&read, (char __user *)arg, sizeof(read)))
-		return -EFAULT;
-	ret = cf_diag_all_read(arg);
-	debug_sprintf_event(cf_diag_dbg, 5, "%s ret %d\n", __func__, ret);
-	return ret;
-}
-
-static long cf_diag_ioctl_stop(void)
-{
-	int ret;
-
-	debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__);
-	ret = cf_diag_all_stop();
-	cf_diag_ctrset_clear();
-	debug_sprintf_event(cf_diag_dbg, 5, "%s ret %d\n", __func__, ret);
-	return ret;
-}
-
-static long cf_diag_ioctl_start(unsigned long arg)
-{
-	struct s390_ctrset_start __user *ustart;
-	struct s390_ctrset_start start;
-	void __user *umask;
-	unsigned int len;
-	int ret = 0;
-	size_t need;
-
-	if (cf_diag_ctrset.ctrset)
-		return -EBUSY;
-	ustart = (struct s390_ctrset_start __user *)arg;
-	if (copy_from_user(&start, ustart, sizeof(start)))
-		return -EFAULT;
-	if (start.version != S390_HWCTR_START_VERSION)
-		return -EINVAL;
-	if (start.counter_sets & ~(cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC] |
-				   cpumf_ctr_ctl[CPUMF_CTR_SET_USER] |
-				   cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO] |
-				   cpumf_ctr_ctl[CPUMF_CTR_SET_EXT] |
-				   cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG]))
-		return -EINVAL;		/* Invalid counter set */
-	if (!start.counter_sets)
-		return -EINVAL;		/* No counter set at all? */
-	cpumask_clear(&cf_diag_ctrset.mask);
-	len = min_t(u64, start.cpumask_len, cpumask_size());
-	umask = (void __user *)start.cpumask;
-	if (copy_from_user(&cf_diag_ctrset.mask, umask, len))
-		return -EFAULT;
-	if (cpumask_empty(&cf_diag_ctrset.mask))
-		return -EINVAL;
-	need = cf_diag_needspace(start.counter_sets);
-	if (put_user(need, &ustart->data_bytes))
-		ret = -EFAULT;
-	if (ret)
-		goto out;
-	cf_diag_ctrset.ctrset = start.counter_sets;
-	ret = cf_diag_all_start();
-out:
-	if (ret)
-		cf_diag_ctrset_clear();
-	debug_sprintf_event(cf_diag_dbg, 2, "%s sets %#lx need %ld ret %d\n",
-			    __func__, cf_diag_ctrset.ctrset, need, ret);
-	return ret;
-}
-
-static long cf_diag_ioctl(struct file *file, unsigned int cmd,
-			  unsigned long arg)
-{
-	int ret;
-
-	debug_sprintf_event(cf_diag_dbg, 2, "%s cmd %#x arg %lx\n", __func__,
-			    cmd, arg);
-	get_online_cpus();
-	mutex_lock(&cf_diag_ctrset_mutex);
-	switch (cmd) {
-	case S390_HWCTR_START:
-		ret = cf_diag_ioctl_start(arg);
-		break;
-	case S390_HWCTR_STOP:
-		ret = cf_diag_ioctl_stop();
-		break;
-	case S390_HWCTR_READ:
-		ret = cf_diag_ioctl_read(arg);
-		break;
-	default:
-		ret = -ENOTTY;
-		break;
-	}
-	mutex_unlock(&cf_diag_ctrset_mutex);
-	put_online_cpus();
-	debug_sprintf_event(cf_diag_dbg, 2, "%s ret %d\n", __func__, ret);
-	return ret;
-}
-
-static const struct file_operations cf_diag_fops = {
-	.owner = THIS_MODULE,
-	.open = cf_diag_open,
-	.release = cf_diag_release,
-	.unlocked_ioctl	= cf_diag_ioctl,
-	.compat_ioctl = cf_diag_ioctl,
-	.llseek = no_llseek
-};
-
-static struct miscdevice cf_diag_dev = {
-	.name	= S390_HWCTR_DEVICE,
-	.minor	= MISC_DYNAMIC_MINOR,
-	.fops	= &cf_diag_fops,
-};
-
-static int cf_diag_online_cpu(unsigned int cpu)
-{
-	struct cf_diag_call_on_cpu_parm p;
-
-	mutex_lock(&cf_diag_ctrset_mutex);
-	if (!cf_diag_ctrset.ctrset)
-		goto out;
-	p.sets = cf_diag_ctrset.ctrset;
-	cf_diag_ioctl_on(&p);
-out:
-	mutex_unlock(&cf_diag_ctrset_mutex);
-	return 0;
-}
-
-static int cf_diag_offline_cpu(unsigned int cpu)
-{
-	struct cf_diag_call_on_cpu_parm p;
-
-	mutex_lock(&cf_diag_ctrset_mutex);
-	if (!cf_diag_ctrset.ctrset)
-		goto out;
-	p.sets = cf_diag_ctrset.ctrset;
-	cf_diag_ioctl_off(&p);
-out:
-	mutex_unlock(&cf_diag_ctrset_mutex);
-	return 0;
-}
-
-/* Initialize the counter set PMU to generate complete counter set data as
- * event raw data. This relies on the CPU Measurement Counter Facility device
- * already being loaded and initialized.
- */
-static int __init cf_diag_init(void)
-{
-	struct cpumf_ctr_info info;
-	size_t need;
-	int rc;
-
-	if (!kernel_cpumcf_avail() || !stccm_avail() || qctri(&info))
-		return -ENODEV;
-	cf_diag_get_cpu_speed();
-
-	/* Make sure the counter set data fits into predefined buffer. */
-	need = cf_diag_ctrset_maxsize(&info);
-	if (need > sizeof(((struct cf_diag_csd *)0)->start)) {
-		pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n",
-		       need);
-		return -ENOMEM;
-	}
-
-	rc = misc_register(&cf_diag_dev);
-	if (rc) {
-		pr_err("Registration of /dev/" S390_HWCTR_DEVICE
-		       "failed rc=%d\n", rc);
-		goto out;
-	}
-
-	/* Setup s390dbf facility */
-	cf_diag_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128);
-	if (!cf_diag_dbg) {
-		pr_err("Registration of s390dbf(cpum_cf_diag) failed\n");
-		rc = -ENOMEM;
-		goto out_dbf;
-	}
-	debug_register_view(cf_diag_dbg, &debug_sprintf_view);
-
-	rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1);
-	if (rc) {
-		pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n",
-		       rc);
-		goto out_perf;
-	}
-	rc = cpuhp_setup_state_nocalls(CPUHP_AP_PERF_S390_CFD_ONLINE,
-				       "perf/s390/cfd:online",
-				       cf_diag_online_cpu, cf_diag_offline_cpu);
-	if (!rc)
-		goto out;
-
-	pr_err("Registration of CPUHP_AP_PERF_S390_CFD_ONLINE failed rc=%i\n",
-	       rc);
-	perf_pmu_unregister(&cf_diag);
-out_perf:
-	debug_unregister_view(cf_diag_dbg, &debug_sprintf_view);
-	debug_unregister(cf_diag_dbg);
-out_dbf:
-	misc_deregister(&cf_diag_dev);
-out:
-	return rc;
-}
-device_initcall(cf_diag_init);
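
For orientation, here is a hedged user-space sketch of the START/READ/STOP ioctl lifecycle that the code removed above implemented (device name, ioctl commands and struct fields are taken from the hunks above; the uapi header name, the cpumask field type and the error handling are my assumptions):

/* Hypothetical user-space client, illustration only. */
#include <fcntl.h>
#include <sched.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <asm/hwctrset.h>			/* assumed uapi header */

static int dump_counter_sets(unsigned long sets)
{
	struct s390_ctrset_start start;
	cpu_set_t cpus;
	void *buf = NULL;
	int fd, rc;

	fd = open("/dev/" S390_HWCTR_DEVICE, O_RDWR);	/* exclusive open */
	if (fd < 0)
		return -1;
	CPU_ZERO(&cpus);
	CPU_SET(0, &cpus);				/* sample CPU 0 only */
	start.version = S390_HWCTR_START_VERSION;
	start.counter_sets = sets;			/* counter set bit mask */
	start.cpumask_len = sizeof(cpus);
	start.cpumask = (__u64 *)&cpus;			/* assumed field type */
	rc = ioctl(fd, S390_HWCTR_START, &start);
	if (!rc) {
		/* The kernel filled start.data_bytes with the worst-case
		 * size for a subsequent READ (see cf_diag_needspace()). */
		buf = malloc(start.data_bytes);
		rc = buf ? ioctl(fd, S390_HWCTR_READ, buf) : -1;
		/* walk the s390_ctrset_cpudata records in buf here */
		ioctl(fd, S390_HWCTR_STOP, 0);
	}
	free(buf);
	close(fd);
	return rc;
}
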
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 7ae5dde..350e94d 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -166,6 +166,12 @@
 			p->thread.acrs[1] = (unsigned int)tls;
 		}
 	}
+	/*
+	 * s390 stores the svc return address in arch_data when calling
+	 * sigreturn()/restart_syscall() via vdso. 1 means no valid address
+	 * stored.
+	 */
+	p->restart_block.arch_data = 1;
 	return 0;
 }
 
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 5486d82..ff0f9e8 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -354,7 +354,7 @@
 	set_task_stack_end_magic(current);
 	stack += STACK_INIT_OFFSET;
 	S390_lowcore.kernel_stack = stack;
-	CALL_ON_STACK_NORETURN(rest_init, stack);
+	call_on_stack_noreturn(rest_init, stack);
 }
 
 static void __init setup_lowcore_dat_off(void)
@@ -442,6 +442,7 @@
 	lc->br_r1_trampoline = 0x07f1;	/* br %r1 */
 	lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW);
 	lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW);
+	lc->preempt_count = PREEMPT_DISABLED;
 
 	set_prefix((u32)(unsigned long) lc);
 	lowcore_ptr[0] = lc;
diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index 080e7ae..78ef53b 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -32,6 +32,7 @@
 #include <linux/uaccess.h>
 #include <asm/lowcore.h>
 #include <asm/switch_to.h>
+#include <asm/vdso.h>
 #include "entry.h"
 
 /*
@@ -171,7 +172,6 @@
 	fpregs_load(&user_sregs.fpregs, &current->thread.fpu);
 
 	clear_pt_regs_flag(regs, PIF_SYSCALL); /* No longer in a system call */
-	clear_pt_regs_flag(regs, PIF_SYSCALL_RESTART);
 	return 0;
 }
 
@@ -334,15 +334,10 @@
 
 	/* Set up to return from userspace.  If provided, use a stub
 	   already in userspace.  */
-	if (ka->sa.sa_flags & SA_RESTORER) {
+	if (ka->sa.sa_flags & SA_RESTORER)
 		restorer = (unsigned long) ka->sa.sa_restorer;
-	} else {
-		/* Signal frame without vector registers are short ! */
-		__u16 __user *svc = (void __user *) frame + frame_size - 2;
-		if (__put_user(S390_SYSCALL_OPCODE | __NR_sigreturn, svc))
-			return -EFAULT;
-		restorer = (unsigned long) svc;
-	}
+	else
+		restorer = VDSO64_SYMBOL(current, sigreturn);
 
 	/* Set up registers for signal handler */
 	regs->gprs[14] = restorer;
@@ -397,14 +392,10 @@
 
 	/* Set up to return from userspace.  If provided, use a stub
 	   already in userspace.  */
-	if (ksig->ka.sa.sa_flags & SA_RESTORER) {
+	if (ksig->ka.sa.sa_flags & SA_RESTORER)
 		restorer = (unsigned long) ksig->ka.sa.sa_restorer;
-	} else {
-		__u16 __user *svc = &frame->svc_insn;
-		if (__put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, svc))
-			return -EFAULT;
-		restorer = (unsigned long) svc;
-	}
+	else
+		restorer = VDSO64_SYMBOL(current, rt_sigreturn);
 
 	/* Create siginfo on the signal stack */
 	if (copy_siginfo_to_user(&frame->info, &ksig->info))
@@ -501,7 +492,7 @@
 		}
 		/* No longer in a system call */
 		clear_pt_regs_flag(regs, PIF_SYSCALL);
-		clear_pt_regs_flag(regs, PIF_SYSCALL_RESTART);
+
 		rseq_signal_deliver(&ksig, regs);
 		if (is_compat_task())
 			handle_signal32(&ksig, oldset, regs);
@@ -517,14 +508,20 @@
 		switch (regs->gprs[2]) {
 		case -ERESTART_RESTARTBLOCK:
 			/* Restart with sys_restart_syscall */
-			regs->int_code = __NR_restart_syscall;
-			fallthrough;
+			regs->gprs[2] = regs->orig_gpr2;
+			current->restart_block.arch_data = regs->psw.addr;
+			if (is_compat_task())
+				regs->psw.addr = VDSO32_SYMBOL(current, restart_syscall);
+			else
+				regs->psw.addr = VDSO64_SYMBOL(current, restart_syscall);
+			if (test_thread_flag(TIF_SINGLE_STEP))
+				clear_thread_flag(TIF_PER_TRAP);
+			break;
 		case -ERESTARTNOHAND:
 		case -ERESTARTSYS:
 		case -ERESTARTNOINTR:
-			/* Restart system call with magic TIF bit. */
 			regs->gprs[2] = regs->orig_gpr2;
-			set_pt_regs_flag(regs, PIF_SYSCALL_RESTART);
+			regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16);
 			if (test_thread_flag(TIF_SINGLE_STEP))
 				clear_thread_flag(TIF_PER_TRAP);
 			break;
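
The net effect of the two signal-frame hunks above: when the application supplies no SA_RESTORER, the handler's return address now points into the vdso instead of at an svc instruction written onto the signal stack. Roughly (gpr 14 is the return-address register; the handler setup line is paraphrased from code outside this hunk):

/*
 * Sketch of the primed signal frame:
 *
 *	regs->gprs[14] = VDSO64_SYMBOL(current, rt_sigreturn);
 *	regs->psw.addr = (unsigned long)ksig->ka.sa.sa_handler;
 *
 * handler() returns via br %r14 into __kernel_rt_sigreturn, which
 * executes "svc __NR_rt_sigreturn". The vdso text is mapped
 * executable, so the signal stack no longer has to be, which is what
 * lets the NX pgm-check workarounds go away.
 */
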
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index ff42d3a..8984711 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -210,6 +210,7 @@
 	lc->br_r1_trampoline = 0x07f1;	/* br %r1 */
 	lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW);
 	lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW);
+	lc->preempt_count = PREEMPT_DISABLED;
 	if (nmi_alloc_per_cpu(lc))
 		goto out;
 	lowcore_ptr[cpu] = lc;
@@ -300,24 +301,28 @@
 	pcpu_sigp_retry(pcpu, SIGP_RESTART, 0);
 }
 
+typedef void (pcpu_delegate_fn)(void *);
+
 /*
  * Call function via PSW restart on pcpu and stop the current cpu.
  */
-static void __pcpu_delegate(void (*func)(void*), void *data)
+static void __pcpu_delegate(pcpu_delegate_fn *func, void *data)
 {
 	func(data);	/* should not return */
 }
 
 static void __no_sanitize_address pcpu_delegate(struct pcpu *pcpu,
-						void (*func)(void *),
+						pcpu_delegate_fn *func,
 						void *data, unsigned long stack)
 {
 	struct lowcore *lc = lowcore_ptr[pcpu - pcpu_devices];
 	unsigned long source_cpu = stap();
 
 	__load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT);
-	if (pcpu->address == source_cpu)
-		CALL_ON_STACK(__pcpu_delegate, stack, 2, func, data);
+	if (pcpu->address == source_cpu) {
+		call_on_stack(2, stack, void, __pcpu_delegate,
+			      pcpu_delegate_fn *, func, void *, data);
+	}
 	/* Stop target cpu (if func returns this stops the current cpu). */
 	pcpu_sigp_retry(pcpu, SIGP_STOP, 0);
 	/* Restart func on the target cpu and stop the current cpu. */
@@ -898,7 +903,7 @@
 	S390_lowcore.restart_source = -1UL;
 	__ctl_load(S390_lowcore.cregs_save_area, 0, 15);
 	__load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT);
-	CALL_ON_STACK_NORETURN(smp_init_secondary, S390_lowcore.kernel_stack);
+	call_on_stack_noreturn(smp_init_secondary, S390_lowcore.kernel_stack);
 }
 
 /* Upping and downing of CPUs */
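
The typed call_on_stack() used above takes the number of arguments, the stack to switch to, the return type, the function, and one explicit type/value pair per argument. A minimal usage sketch (do_work() and struct work_ctx are invented for illustration):

/* Run do_work(ctx) on the nodat stack and hand back its int result.
 * A type/value pair that does not match the function's prototype now
 * fails at compile time - the point of the rework compared to the
 * untyped CALL_ON_STACK(). */
static int do_work(struct work_ctx *ctx);

static int run_on_nodat_stack(struct work_ctx *ctx)
{
	return call_on_stack(1, S390_lowcore.nodat_stack,
			     int, do_work, struct work_ctx *, ctx);
}
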
diff --git a/arch/s390/kernel/syscall.c b/arch/s390/kernel/syscall.c
index 76f7916..8fe2d23b 100644
--- a/arch/s390/kernel/syscall.c
+++ b/arch/s390/kernel/syscall.c
@@ -108,7 +108,7 @@
 	return -ENOSYS;
 }
 
-void do_syscall(struct pt_regs *regs)
+static void do_syscall(struct pt_regs *regs)
 {
 	unsigned long nr;
 
@@ -121,6 +121,10 @@
 
 	regs->gprs[2] = nr;
 
+	if (nr == __NR_restart_syscall && !(current->restart_block.arch_data & 1)) {
+		regs->psw.addr = current->restart_block.arch_data;
+		current->restart_block.arch_data = 1;
+	}
 	nr = syscall_enter_from_user_mode_work(regs, nr);
 
 	/*
@@ -130,13 +134,16 @@
 	 * work, the ptrace code sets PIF_SYSCALL_RET_SET, which is checked here
 	 * and if set, the syscall will be skipped.
 	 */
-	if (!test_pt_regs_flag(regs, PIF_SYSCALL_RET_SET)) {
-		regs->gprs[2] = -ENOSYS;
-		if (likely(nr < NR_syscalls))
-			regs->gprs[2] = current->thread.sys_call_table[nr](regs);
-	} else {
-		clear_pt_regs_flag(regs, PIF_SYSCALL_RET_SET);
-	}
+
+	if (unlikely(test_and_clear_pt_regs_flag(regs, PIF_SYSCALL_RET_SET)))
+		goto out;
+	regs->gprs[2] = -ENOSYS;
+	if (unlikely(nr >= NR_syscalls))
+		goto out;
+	do {
+		regs->gprs[2] = current->thread.sys_call_table[nr](regs);
+	} while (test_and_clear_pt_regs_flag(regs, PIF_EXECVE_PGSTE_RESTART));
+out:
 	syscall_exit_to_user_mode_work(regs);
 }
 
@@ -154,13 +161,8 @@
 	if (per_trap)
 		set_thread_flag(TIF_PER_TRAP);
 
-	for (;;) {
-		regs->flags = 0;
-		set_pt_regs_flag(regs, PIF_SYSCALL);
-		do_syscall(regs);
-		if (!test_pt_regs_flag(regs, PIF_SYSCALL_RESTART))
-			break;
-		local_irq_enable();
-	}
+	regs->flags = 0;
+	set_pt_regs_flag(regs, PIF_SYSCALL);
+	do_syscall(regs);
 	exit_to_user_mode();
 }
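
Together with the signal.c hunk earlier, this replaces the PIF_SYSCALL_RESTART loop with a round trip through the vdso. A condensed walk-through of the -ERESTART_RESTARTBLOCK case, restated from the hunks in this series (the comments are mine):

/* 1. Signal delivery (signal.c): save the continuation address and
 *    divert the PSW to the vdso trampoline. */
current->restart_block.arch_data = regs->psw.addr;
regs->psw.addr = VDSO64_SYMBOL(current, restart_syscall);

/* 2. The trampoline (vdso_user_wrapper.S) issues
 *    "svc __NR_restart_syscall" from vdso text, so no instruction has
 *    to live on the user stack. */

/* 3. Syscall entry (above): recognize the trampoline's svc and restore
 *    the saved address. Instruction addresses are even, so the odd
 *    value 1 means "nothing saved". */
if (nr == __NR_restart_syscall && !(current->restart_block.arch_data & 1)) {
	regs->psw.addr = current->restart_block.arch_data;
	current->restart_block.arch_data = 1;
}
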
diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c
index 019c574..7694727 100644
--- a/arch/s390/kernel/traps.c
+++ b/arch/s390/kernel/traps.c
@@ -277,6 +277,8 @@
 {
 	int val = 1;
 
+	if (!IS_ENABLED(CONFIG_BUG))
+		return;
 	asm volatile(
 		"	mc	0,0\n"
 		"0:	xgr	%0,%0\n"
@@ -299,10 +301,9 @@
 void noinstr __do_pgm_check(struct pt_regs *regs)
 {
 	unsigned long last_break = S390_lowcore.breaking_event_addr;
-	unsigned int trapnr, syscall_redirect = 0;
+	unsigned int trapnr;
 	irqentry_state_t state;
 
-	add_random_kstack_offset();
 	regs->int_code = *(u32 *)&S390_lowcore.pgm_ilc;
 	regs->int_parm_long = S390_lowcore.trans_exc_code;
 
@@ -344,18 +345,9 @@
 	trapnr = regs->int_code & PGM_INT_CODE_MASK;
 	if (trapnr)
 		pgm_check_table[trapnr](regs);
-	syscall_redirect = user_mode(regs) && test_pt_regs_flag(regs, PIF_SYSCALL);
 out:
 	local_irq_disable();
 	irqentry_exit(regs, state);
-
-	if (syscall_redirect) {
-		enter_from_user_mode(regs);
-		local_irq_enable();
-		regs->orig_gpr2 = regs->gprs[2];
-		do_syscall(regs);
-		exit_to_user_mode();
-	}
 }
 
 /*
diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
index 6be2167..aeb0a15 100644
--- a/arch/s390/kernel/uv.c
+++ b/arch/s390/kernel/uv.c
@@ -358,6 +358,15 @@
 static struct kobj_attribute uv_query_facilities_attr =
 	__ATTR(facilities, 0444, uv_query_facilities, NULL);
 
+static ssize_t uv_query_feature_indications(struct kobject *kobj,
+					    struct kobj_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%lx\n", uv_info.uv_feature_indications);
+}
+
+static struct kobj_attribute uv_query_feature_indications_attr =
+	__ATTR(feature_indications, 0444, uv_query_feature_indications, NULL);
+
 static ssize_t uv_query_max_guest_cpus(struct kobject *kobj,
 				       struct kobj_attribute *attr, char *page)
 {
@@ -390,6 +399,7 @@
 
 static struct attribute *uv_query_attrs[] = {
 	&uv_query_facilities_attr.attr,
+	&uv_query_feature_indications_attr.attr,
 	&uv_query_max_guest_cpus_attr.attr,
 	&uv_query_max_guest_vms_attr.attr,
 	&uv_query_max_guest_addr_attr.attr,
diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c
index 8c4e07d..9969426 100644
--- a/arch/s390/kernel/vdso.c
+++ b/arch/s390/kernel/vdso.c
@@ -20,7 +20,7 @@
 #include <asm/vdso.h>
 
 extern char vdso64_start[], vdso64_end[];
-static unsigned int vdso_pages;
+extern char vdso32_start[], vdso32_end[];
 
 static struct vm_special_mapping vvar_mapping;
 
@@ -37,18 +37,6 @@
 	VVAR_NR_PAGES,
 };
 
-unsigned int __read_mostly vdso_enabled = 1;
-
-static int __init vdso_setup(char *str)
-{
-	bool enabled;
-
-	if (!kstrtobool(str, &enabled))
-		vdso_enabled = enabled;
-	return 1;
-}
-__setup("vdso=", vdso_setup);
-
 #ifdef CONFIG_TIME_NS
 struct vdso_data *arch_get_vdso_data(void *vvar_page)
 {
@@ -155,7 +143,12 @@
 	.fault = vvar_fault,
 };
 
-static struct vm_special_mapping vdso_mapping = {
+static struct vm_special_mapping vdso64_mapping = {
+	.name = "[vdso]",
+	.mremap = vdso_mremap,
+};
+
+static struct vm_special_mapping vdso32_mapping = {
 	.name = "[vdso]",
 	.mremap = vdso_mremap,
 };
@@ -171,16 +164,22 @@
 {
 	unsigned long vdso_text_len, vdso_mapping_len;
 	unsigned long vvar_start, vdso_text_start;
+	struct vm_special_mapping *vdso_mapping;
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 	int rc;
 
 	BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES);
-	if (!vdso_enabled || is_compat_task())
-		return 0;
 	if (mmap_write_lock_killable(mm))
 		return -EINTR;
-	vdso_text_len = vdso_pages << PAGE_SHIFT;
+
+	if (is_compat_task()) {
+		vdso_text_len = vdso32_end - vdso32_start;
+		vdso_mapping = &vdso32_mapping;
+	} else {
+		vdso_text_len = vdso64_end - vdso64_start;
+		vdso_mapping = &vdso64_mapping;
+	}
 	vdso_mapping_len = vdso_text_len + VVAR_NR_PAGES * PAGE_SIZE;
 	vvar_start = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0);
 	rc = vvar_start;
@@ -198,7 +197,7 @@
 	vma = _install_special_mapping(mm, vdso_text_start, vdso_text_len,
 				       VM_READ|VM_EXEC|
 				       VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
-				       &vdso_mapping);
+				       vdso_mapping);
 	if (IS_ERR(vma)) {
 		do_munmap(mm, vvar_start, PAGE_SIZE, NULL);
 		rc = PTR_ERR(vma);
@@ -211,21 +210,25 @@
 	return rc;
 }
 
-static int __init vdso_init(void)
+static struct page ** __init vdso_setup_pages(void *start, void *end)
 {
-	struct page **pages;
+	int pages = (end - start) >> PAGE_SHIFT;
+	struct page **pagelist;
 	int i;
 
-	vdso_pages = (vdso64_end - vdso64_start) >> PAGE_SHIFT;
-	pages = kcalloc(vdso_pages + 1, sizeof(struct page *), GFP_KERNEL);
-	if (!pages) {
-		vdso_enabled = 0;
-		return -ENOMEM;
-	}
-	for (i = 0; i < vdso_pages; i++)
-		pages[i] = virt_to_page(vdso64_start + i * PAGE_SIZE);
-	pages[vdso_pages] = NULL;
-	vdso_mapping.pages = pages;
+	pagelist = kcalloc(pages + 1, sizeof(struct page *), GFP_KERNEL);
+	if (!pagelist)
+		panic("%s: Cannot allocate page list for VDSO", __func__);
+	for (i = 0; i < pages; i++)
+		pagelist[i] = virt_to_page(start + i * PAGE_SIZE);
+	return pagelist;
+}
+
+static int __init vdso_init(void)
+{
+	vdso64_mapping.pages = vdso_setup_pages(vdso64_start, vdso64_end);
+	if (IS_ENABLED(CONFIG_COMPAT))
+		vdso32_mapping.pages = vdso_setup_pages(vdso32_start, vdso32_end);
 	return 0;
 }
 arch_initcall(vdso_init);
diff --git a/arch/s390/kernel/vdso32/.gitignore b/arch/s390/kernel/vdso32/.gitignore
new file mode 100644
index 0000000..5167384
--- /dev/null
+++ b/arch/s390/kernel/vdso32/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+vdso32.lds
diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile
new file mode 100644
index 0000000..b2349a3
--- /dev/null
+++ b/arch/s390/kernel/vdso32/Makefile
@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: GPL-2.0
+# List of files in the vdso
+
+KCOV_INSTRUMENT := n
+ARCH_REL_TYPE_ABS := R_390_COPY|R_390_GLOB_DAT|R_390_JMP_SLOT|R_390_RELATIVE
+ARCH_REL_TYPE_ABS += R_390_GOT|R_390_PLT
+
+include $(srctree)/lib/vdso/Makefile
+obj-vdso32 = vdso_user_wrapper-32.o note-32.o
+
+# Build rules
+
+targets := $(obj-vdso32) vdso32.so vdso32.so.dbg
+obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32))
+
+KBUILD_AFLAGS += -DBUILD_VDSO
+KBUILD_CFLAGS += -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING
+
+KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS))
+KBUILD_AFLAGS_32 += -m31 -s
+
+KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS))
+KBUILD_CFLAGS_32 += -m31 -fPIC -shared -fno-common -fno-builtin
+
+LDFLAGS_vdso32.so.dbg += -fPIC -shared -nostdlib -soname=linux-vdso32.so.1 \
+	--hash-style=both --build-id=sha1 -melf_s390 -T
+
+$(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
+$(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
+
+obj-y += vdso32_wrapper.o
+CPPFLAGS_vdso32.lds += -P -C -U$(ARCH)
+
+# Disable gcov profiling, ubsan and kasan for VDSO code
+GCOV_PROFILE := n
+UBSAN_SANITIZE := n
+KASAN_SANITIZE := n
+
+# Force dependency (incbin is bad)
+$(obj)/vdso32_wrapper.o : $(obj)/vdso32.so
+
+$(obj)/vdso32.so.dbg: $(src)/vdso32.lds $(obj-vdso32) FORCE
+	$(call if_changed,ld)
+
+# strip rule for the .so file
+$(obj)/%.so: OBJCOPYFLAGS := -S
+$(obj)/%.so: $(obj)/%.so.dbg FORCE
+	$(call if_changed,objcopy)
+
+$(obj-vdso32): %-32.o: %.S FORCE
+	$(call if_changed_dep,vdso32as)
+
+# actual build commands
+quiet_cmd_vdso32as = VDSO32A $@
+      cmd_vdso32as = $(CC) $(a_flags) -c -o $@ $<
+quiet_cmd_vdso32cc = VDSO32C $@
+      cmd_vdso32cc = $(CC) $(c_flags) -c -o $@ $<
+
+# install commands for the unstripped file
+quiet_cmd_vdso_install = INSTALL $@
+      cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@
+
+vdso32.so: $(obj)/vdso32.so.dbg
+	@mkdir -p $(MODLIB)/vdso
+	$(call cmd,vdso_install)
+
+vdso_install: vdso32.so
+
+# Generate VDSO offsets using helper script
+gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh
+quiet_cmd_vdsosym = VDSOSYM $@
+	cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@
+
+include/generated/vdso32-offsets.h: $(obj)/vdso32.so.dbg FORCE
+	$(call if_changed,vdsosym)
diff --git a/arch/s390/kernel/vdso32/gen_vdso_offsets.sh b/arch/s390/kernel/vdso32/gen_vdso_offsets.sh
new file mode 100755
index 0000000..9c4f951
--- /dev/null
+++ b/arch/s390/kernel/vdso32/gen_vdso_offsets.sh
@@ -0,0 +1,15 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+#
+# Match symbols in the DSO that look like __kernel_compat_*; produce a header file
+# of constant offsets into the shared object.
+#
+# Doing this inside the Makefile will break the $(filter-out) function,
+# causing Kbuild to rebuild the vdso-offsets header file every time.
+#
+# Inspired by arm64 version.
+#
+
+LC_ALL=C
+sed -n 's/\([0-9a-f]*\) . __kernel_compat_\(.*\)/\#define vdso32_offset_\2\t0x\1/p'
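
By way of example (symbol value and type invented), an nm line for the compat vdso such as

	00000000000004e2 t __kernel_compat_sigreturn

is turned into

	#define vdso32_offset_sigreturn	0x00000000000004e2

The vdso64 twin of this script emits vdso64_offset_* constants the same way; these generated headers are presumably what the VDSO32_SYMBOL()/VDSO64_SYMBOL() macros used in signal.c resolve against (vdso mapping base plus offset).
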
diff --git a/arch/s390/kernel/vdso32/note.S b/arch/s390/kernel/vdso32/note.S
new file mode 100644
index 0000000..db19d06
--- /dev/null
+++ b/arch/s390/kernel/vdso32/note.S
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
+ * Here we can supply some information useful to userland.
+ */
+
+#include <linux/uts.h>
+#include <linux/version.h>
+#include <linux/elfnote.h>
+
+ELFNOTE_START(Linux, 0, "a")
+	.long LINUX_VERSION_CODE
+ELFNOTE_END
diff --git a/arch/s390/kernel/vdso32/vdso32.lds.S b/arch/s390/kernel/vdso32/vdso32.lds.S
new file mode 100644
index 0000000..bff50b6a
--- /dev/null
+++ b/arch/s390/kernel/vdso32/vdso32.lds.S
@@ -0,0 +1,141 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This is the infamous ld script for the 31-bit compat vdso
+ * library
+ */
+
+#include <asm/page.h>
+#include <asm/vdso.h>
+
+OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390")
+OUTPUT_ARCH(s390:31-bit)
+ENTRY(_start)
+
+SECTIONS
+{
+	PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE);
+#ifdef CONFIG_TIME_NS
+	PROVIDE(_timens_data = _vdso_data + PAGE_SIZE);
+#endif
+	. = VDSO_LBASE + SIZEOF_HEADERS;
+
+	.hash		: { *(.hash) }			:text
+	.gnu.hash	: { *(.gnu.hash) }
+	.dynsym		: { *(.dynsym) }
+	.dynstr		: { *(.dynstr) }
+	.gnu.version	: { *(.gnu.version) }
+	.gnu.version_d	: { *(.gnu.version_d) }
+	.gnu.version_r	: { *(.gnu.version_r) }
+
+	.note		: { *(.note.*) }		:text	:note
+
+	. = ALIGN(16);
+	.text		: {
+		*(.text .stub .text.* .gnu.linkonce.t.*)
+	} :text
+	PROVIDE(__etext = .);
+	PROVIDE(_etext = .);
+	PROVIDE(etext = .);
+
+	/*
+	 * Other stuff is appended to the text segment:
+	 */
+	.rodata		: { *(.rodata .rodata.* .gnu.linkonce.r.*) }
+	.rodata1	: { *(.rodata1) }
+
+	.dynamic	: { *(.dynamic) }		:text	:dynamic
+
+	.eh_frame_hdr	: { *(.eh_frame_hdr) }		:text	:eh_frame_hdr
+	.eh_frame	: { KEEP (*(.eh_frame)) }	:text
+	.gcc_except_table : { *(.gcc_except_table .gcc_except_table.*) }
+
+	.rela.dyn ALIGN(8) : { *(.rela.dyn) }
+	.got ALIGN(8)	: { *(.got .toc) }
+
+	_end = .;
+	PROVIDE(end = .);
+
+	/*
+	 * Stabs debugging sections are here too.
+	 */
+	.stab	       0 : { *(.stab) }
+	.stabstr       0 : { *(.stabstr) }
+	.stab.excl     0 : { *(.stab.excl) }
+	.stab.exclstr  0 : { *(.stab.exclstr) }
+	.stab.index    0 : { *(.stab.index) }
+	.stab.indexstr 0 : { *(.stab.indexstr) }
+	.comment       0 : { *(.comment) }
+
+	/*
+	 * DWARF debug sections.
+	 * Symbols in the DWARF debugging sections are relative to the
+	 * beginning of the section so we begin them at 0.
+	 */
+	/* DWARF 1 */
+	.debug		0 : { *(.debug) }
+	.line		0 : { *(.line) }
+	/* GNU DWARF 1 extensions */
+	.debug_srcinfo	0 : { *(.debug_srcinfo) }
+	.debug_sfnames	0 : { *(.debug_sfnames) }
+	/* DWARF 1.1 and DWARF 2 */
+	.debug_aranges	0 : { *(.debug_aranges) }
+	.debug_pubnames 0 : { *(.debug_pubnames) }
+	/* DWARF 2 */
+	.debug_info	0 : { *(.debug_info .gnu.linkonce.wi.*) }
+	.debug_abbrev	0 : { *(.debug_abbrev) }
+	.debug_line	0 : { *(.debug_line) }
+	.debug_frame	0 : { *(.debug_frame) }
+	.debug_str	0 : { *(.debug_str) }
+	.debug_loc	0 : { *(.debug_loc) }
+	.debug_macinfo	0 : { *(.debug_macinfo) }
+	/* SGI/MIPS DWARF 2 extensions */
+	.debug_weaknames 0 : { *(.debug_weaknames) }
+	.debug_funcnames 0 : { *(.debug_funcnames) }
+	.debug_typenames 0 : { *(.debug_typenames) }
+	.debug_varnames  0 : { *(.debug_varnames) }
+	/* DWARF 3 */
+	.debug_pubtypes 0 : { *(.debug_pubtypes) }
+	.debug_ranges	0 : { *(.debug_ranges) }
+	.gnu.attributes 0 : { KEEP (*(.gnu.attributes)) }
+
+	/DISCARD/	: {
+		*(.note.GNU-stack)
+		*(.branch_lt)
+		*(.data .data.* .gnu.linkonce.d.* .sdata*)
+		*(.bss .sbss .dynbss .dynsbss)
+	}
+}
+
+/*
+ * Very old versions of ld do not recognize this name token; use the constant.
+ */
+#define PT_GNU_EH_FRAME	0x6474e550
+
+/*
+ * We must supply the ELF program headers explicitly to get just one
+ * PT_LOAD segment, and set the flags explicitly to make segments read-only.
+ */
+PHDRS
+{
+	text		PT_LOAD FILEHDR PHDRS FLAGS(5);	/* PF_R|PF_X */
+	dynamic		PT_DYNAMIC FLAGS(4);		/* PF_R */
+	note		PT_NOTE FLAGS(4);		/* PF_R */
+	eh_frame_hdr	PT_GNU_EH_FRAME;
+}
+
+/*
+ * This controls what symbols we export from the DSO.
+ */
+VERSION
+{
+	VDSO_VERSION_STRING {
+	global:
+		/*
+		 * Has to be there for the kernel to find
+		 */
+		__kernel_compat_restart_syscall;
+		__kernel_compat_rt_sigreturn;
+		__kernel_compat_sigreturn;
+	local: *;
+	};
+}
diff --git a/arch/s390/kernel/vdso32/vdso32_wrapper.S b/arch/s390/kernel/vdso32/vdso32_wrapper.S
new file mode 100644
index 0000000..de2fb93
--- /dev/null
+++ b/arch/s390/kernel/vdso32/vdso32_wrapper.S
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+	__PAGE_ALIGNED_DATA
+
+	.globl vdso32_start, vdso32_end
+	.balign PAGE_SIZE
+vdso32_start:
+	.incbin "arch/s390/kernel/vdso32/vdso32.so"
+	.balign PAGE_SIZE
+vdso32_end:
+
+	.previous
diff --git a/arch/s390/kernel/vdso32/vdso_user_wrapper.S b/arch/s390/kernel/vdso32/vdso_user_wrapper.S
new file mode 100644
index 0000000..3f42f27
--- /dev/null
+++ b/arch/s390/kernel/vdso32/vdso_user_wrapper.S
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <asm/unistd.h>
+#include <asm/dwarf.h>
+
+.macro vdso_syscall func,syscall
+	.globl __kernel_compat_\func
+	.type  __kernel_compat_\func,@function
+	.align 8
+__kernel_compat_\func:
+	CFI_STARTPROC
+	svc	\syscall
+	/* Make sure we notice when a syscall returns, which shouldn't happen */
+	.word	0
+	CFI_ENDPROC
+	.size	__kernel_compat_\func,.-__kernel_compat_\func
+.endm
+
+vdso_syscall restart_syscall,__NR_restart_syscall
+vdso_syscall sigreturn,__NR_sigreturn
+vdso_syscall rt_sigreturn,__NR_rt_sigreturn
diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile
index a6e0fb6..2a2092c 100644
--- a/arch/s390/kernel/vdso64/Makefile
+++ b/arch/s390/kernel/vdso64/Makefile
@@ -74,3 +74,11 @@
 	$(call cmd,vdso_install)
 
 vdso_install: vdso64.so
+
+# Generate VDSO offsets using helper script
+gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh
+quiet_cmd_vdsosym = VDSOSYM $@
+	cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@
+
+include/generated/vdso64-offsets.h: $(obj)/vdso64.so.dbg FORCE
+	$(call if_changed,vdsosym)
diff --git a/arch/s390/kernel/vdso64/gen_vdso_offsets.sh b/arch/s390/kernel/vdso64/gen_vdso_offsets.sh
new file mode 100755
index 0000000..37f05cb3
--- /dev/null
+++ b/arch/s390/kernel/vdso64/gen_vdso_offsets.sh
@@ -0,0 +1,15 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+#
+# Match symbols in the DSO that look like __kernel_*; produce a header file
+# of constant offsets into the shared object.
+#
+# Doing this inside the Makefile will break the $(filter-out) function,
+# causing Kbuild to rebuild the vdso-offsets header file every time.
+#
+# Inspired by arm64 version.
+#
+
+LC_ALL=C
+sed -n 's/\([0-9a-f]*\) . __kernel_\(.*\)/\#define vdso64_offset_\2\t0x\1/p'
diff --git a/arch/s390/kernel/vdso64/vdso64.lds.S b/arch/s390/kernel/vdso64/vdso64.lds.S
index 518f1ea4..d4fb336 100644
--- a/arch/s390/kernel/vdso64/vdso64.lds.S
+++ b/arch/s390/kernel/vdso64/vdso64.lds.S
@@ -17,7 +17,7 @@
 #ifdef CONFIG_TIME_NS
 	PROVIDE(_timens_data = _vdso_data + PAGE_SIZE);
 #endif
-	. = VDSO64_LBASE + SIZEOF_HEADERS;
+	. = VDSO_LBASE + SIZEOF_HEADERS;
 
 	.hash		: { *(.hash) }			:text
 	.gnu.hash	: { *(.gnu.hash) }
@@ -137,6 +137,9 @@
 		__kernel_clock_gettime;
 		__kernel_clock_getres;
 		__kernel_getcpu;
+		__kernel_restart_syscall;
+		__kernel_rt_sigreturn;
+		__kernel_sigreturn;
 	local: *;
 	};
 }
diff --git a/arch/s390/kernel/vdso64/vdso_user_wrapper.S b/arch/s390/kernel/vdso64/vdso_user_wrapper.S
index f773505..97f0c0a 100644
--- a/arch/s390/kernel/vdso64/vdso_user_wrapper.S
+++ b/arch/s390/kernel/vdso64/vdso_user_wrapper.S
@@ -37,3 +37,20 @@
 vdso_func clock_getres
 vdso_func clock_gettime
 vdso_func getcpu
+
+.macro vdso_syscall func,syscall
+	.globl __kernel_\func
+	.type  __kernel_\func,@function
+	.align 8
+__kernel_\func:
+	CFI_STARTPROC
+	svc	\syscall
+	/* Make sure we notice when a syscall returns, which shouldn't happen */
+	.word	0
+	CFI_ENDPROC
+	.size	__kernel_\func,.-__kernel_\func
+.endm
+
+vdso_syscall restart_syscall,__NR_restart_syscall
+vdso_syscall sigreturn,__NR_sigreturn
+vdso_syscall rt_sigreturn,__NR_rt_sigreturn
diff --git a/arch/s390/lib/string.c b/arch/s390/lib/string.c
index ec5b76b..cfcdf76 100644
--- a/arch/s390/lib/string.c
+++ b/arch/s390/lib/string.c
@@ -162,7 +162,7 @@
 		"	jo	0b\n"
 		"1:	mvst	%[dummy],%[src]\n"
 		"	jo	1b\n"
-		: [dummy] "=&a" (dummy), [dest] "+&a" (dest), [src] "+&a" (src)
+		: [dummy] "+&a" (dummy), [dest] "+&a" (dest), [src] "+&a" (src)
 		:
 		: "cc", "memory", "0");
 	return ret;
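
The one-character constraint change above is the actual fix: dummy is initialized in C before the asm (outside this hunk) and its value is consumed by the search/copy sequence, so it is an input as well as an output. With "=&a" (write-only, early-clobber) the compiler may allocate the register without loading dummy into it first. A self-contained illustration of the distinction, not the kernel code:

static inline unsigned long add_one(unsigned long val)
{
	/* aghi both reads and writes %[val]; "+" declares exactly
	 * that. Were the constraint "=&a" (output only), the compiler
	 * could skip initializing the register, and aghi would
	 * increment whatever it happened to contain. */
	asm volatile(
		"	aghi	%[val],1\n"
		: [val] "+&a" (val));
	return val;
}
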
diff --git a/arch/s390/lib/test_unwind.c b/arch/s390/lib/test_unwind.c
index 2f32802..ecf327d 100644
--- a/arch/s390/lib/test_unwind.c
+++ b/arch/s390/lib/test_unwind.c
@@ -120,7 +120,7 @@
 #define UWM_REGS		0x2	/* Pass regs to test_unwind(). */
 #define UWM_SP			0x4	/* Pass sp to test_unwind(). */
 #define UWM_CALLER		0x8	/* Unwind starting from caller. */
-#define UWM_SWITCH_STACK	0x10	/* Use CALL_ON_STACK. */
+#define UWM_SWITCH_STACK	0x10	/* Use call_on_stack. */
 #define UWM_IRQ			0x20	/* Unwind from irq context. */
 #define UWM_PGM			0x40	/* Unwind from program check handler. */
 
@@ -211,7 +211,8 @@
 	if (u->flags & UWM_SWITCH_STACK) {
 		local_irq_save(flags);
 		local_mcck_disable();
-		rc = CALL_ON_STACK(unwindme_func3, S390_lowcore.nodat_stack, 1, u);
+		rc = call_on_stack(1, S390_lowcore.nodat_stack,
+				   int, unwindme_func3, struct unwindme *, u);
 		local_mcck_enable();
 		local_irq_restore(flags);
 		return rc;
diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c
index 67606d9..7ec8b1f 100644
--- a/arch/s390/lib/uaccess.c
+++ b/arch/s390/lib/uaccess.c
@@ -224,7 +224,7 @@
 		EX_TABLE(0b,3b)
 		: "+a" (size), "+a" (to), "+a" (from), "+a" (tmp1), "=a" (tmp2)
 		: [spec] "d" (0x810081UL)
-		: "cc", "memory");
+		: "cc", "memory", "0");
 	return size;
 }
 
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 8ae3dc5..e33c43b 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -285,26 +285,6 @@
 			(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
 }
 
-static noinline int signal_return(struct pt_regs *regs)
-{
-	u16 instruction;
-	int rc;
-
-	rc = __get_user(instruction, (u16 __user *) regs->psw.addr);
-	if (rc)
-		return rc;
-	if (instruction == 0x0a77) {
-		set_pt_regs_flag(regs, PIF_SYSCALL);
-		regs->int_code = 0x00040077;
-		return 0;
-	} else if (instruction == 0x0aad) {
-		set_pt_regs_flag(regs, PIF_SYSCALL);
-		regs->int_code = 0x000400ad;
-		return 0;
-	}
-	return -EACCES;
-}
-
 static noinline void do_fault_error(struct pt_regs *regs, int access,
 					vm_fault_t fault)
 {
@@ -312,9 +292,6 @@
 
 	switch (fault) {
 	case VM_FAULT_BADACCESS:
-		if (access == VM_EXEC && signal_return(regs) == 0)
-			break;
-		fallthrough;
 	case VM_FAULT_BADMAP:
 		/* Bad memory access. Check if it is kernel or user space. */
 		if (user_mode(regs)) {
@@ -792,6 +769,32 @@
 	struct page *page;
 	int rc;
 
+	/*
+	 * Bit 61 tells us whether the address is valid; if it is not,
+	 * we either have a major problem and should stop the kernel,
+	 * or we should send a SIGSEGV to the process. Unfortunately
+	 * bit 61 is not reliable without the misc UV feature, so we
+	 * need to check for that as well.
+	 */
+	if (test_bit_inv(BIT_UV_FEAT_MISC, &uv_info.uv_feature_indications) &&
+	    !test_bit_inv(61, &regs->int_parm_long)) {
+		/*
+		 * When this happens, userspace did something that it
+		 * was not supposed to do, e.g. branching into secure
+		 * memory. Trigger a segmentation fault.
+		 */
+		if (user_mode(regs)) {
+			send_sig(SIGSEGV, current, 0);
+			return;
+		}
+
+		/*
+		 * The kernel should never run into this case and we
+		 * have no way out of this situation.
+		 */
+		panic("Unexpected PGM 0x3d with TEID bit 61=0");
+	}
+
 	switch (get_fault_type(regs)) {
 	case USER_FAULT:
 		mm = current->mm;
diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c
index 1f1f906..a0f54bd 100644
--- a/arch/s390/mm/maccess.c
+++ b/arch/s390/mm/maccess.c
@@ -125,12 +125,18 @@
  */
 int memcpy_real(void *dest, void *src, size_t count)
 {
+	unsigned long _dest  = (unsigned long)dest;
+	unsigned long _src   = (unsigned long)src;
+	unsigned long _count = (unsigned long)count;
 	int rc;
 
 	if (S390_lowcore.nodat_stack != 0) {
 		preempt_disable();
-		rc = CALL_ON_STACK(_memcpy_real, S390_lowcore.nodat_stack, 3,
-				   dest, src, count);
+		rc = call_on_stack(3, S390_lowcore.nodat_stack,
+				   unsigned long, _memcpy_real,
+				   unsigned long, _dest,
+				   unsigned long, _src,
+				   unsigned long, _count);
 		preempt_enable();
 		return rc;
 	}
@@ -139,8 +145,7 @@
 	 * not set up yet. Just call _memcpy_real on the early boot
 	 * stack
 	 */
-	return _memcpy_real((unsigned long) dest,(unsigned long) src,
-			    (unsigned long) count);
+	return _memcpy_real(_dest, _src, _count);
 }
 
 /*
diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c
index d256018..8d3a1d8 100644
--- a/drivers/s390/crypto/ap_bus.c
+++ b/drivers/s390/crypto/ap_bus.c
@@ -61,6 +61,9 @@
 module_param_named(aqmask, aqm_str, charp, 0440);
 MODULE_PARM_DESC(aqmask, "AP bus domain mask.");
 
+atomic_t ap_max_msg_size = ATOMIC_INIT(AP_DEFAULT_MAX_MSG_SIZE);
+EXPORT_SYMBOL(ap_max_msg_size);
+
 static struct device *ap_root_device;
 
 /* Hashtable of all queue devices on the AP bus */
@@ -316,11 +319,24 @@
  * Returns true if TAPQ succeeded and the info is filled or
  * false otherwise.
  */
-static bool ap_queue_info(ap_qid_t qid, int *q_type,
-			  unsigned int *q_fac, int *q_depth, bool *q_decfg)
+static bool ap_queue_info(ap_qid_t qid, int *q_type, unsigned int *q_fac,
+			  int *q_depth, int *q_ml, bool *q_decfg)
 {
 	struct ap_queue_status status;
-	unsigned long info = 0;
+	union {
+		unsigned long value;
+		struct {
+			unsigned int fac   : 32; /* facility bits */
+			unsigned int at	   :  8; /* ap type */
+			unsigned int _res1 :  8;
+			unsigned int _res2 :  4;
+			unsigned int ml	   :  4; /* apxl ml */
+			unsigned int _res3 :  4;
+			unsigned int qd	   :  4; /* queue depth */
+		} tapq_gr2;
+	} tapq_info;
+
+	tapq_info.value = 0;
 
	/* make sure we don't run into a specification exception */
 	if (AP_QID_CARD(qid) > ap_max_adapter_id ||
@@ -328,7 +344,7 @@
 		return false;
 
 	/* call TAPQ on this APQN */
-	status = ap_test_queue(qid, ap_apft_available(), &info);
+	status = ap_test_queue(qid, ap_apft_available(), &tapq_info.value);
 	switch (status.response_code) {
 	case AP_RESPONSE_NORMAL:
 	case AP_RESPONSE_RESET_IN_PROGRESS:
@@ -340,11 +356,12 @@
 		 * info should be filled. All bits zero is not possible,
 		 * since at least one of the mode bits is set.
 		 */
-		if (WARN_ON_ONCE(!info))
+		if (WARN_ON_ONCE(!tapq_info.value))
 			return false;
-		*q_type = (int)((info >> 24) & 0xff);
-		*q_fac = (unsigned int)(info >> 32);
-		*q_depth = (int)(info & 0xff);
+		*q_type = tapq_info.tapq_gr2.at;
+		*q_fac = tapq_info.tapq_gr2.fac;
+		*q_depth = tapq_info.tapq_gr2.qd;
+		*q_ml = tapq_info.tapq_gr2.ml;
 		*q_decfg = status.response_code == AP_RESPONSE_DECONFIGURED;
 		switch (*q_type) {
 			/* For CEX2 and CEX3 the available functions
@@ -1516,7 +1533,7 @@
 	unsigned int func;
 	struct device *dev;
 	struct ap_queue *aq;
-	int rc, dom, depth, type;
+	int rc, dom, depth, type, ml;
 
 	/*
 	 * Go through the configuration for the domains and compare them
@@ -1540,7 +1557,7 @@
 			continue;
 		}
 		/* domain is valid, get info from this APQN */
-		if (!ap_queue_info(qid, &type, &func, &depth, &decfg)) {
+		if (!ap_queue_info(qid, &type, &func, &depth, &ml, &decfg)) {
 			if (aq) {
 				AP_DBF_INFO(
 					"%s(%d,%d) ap_queue_info() not successful, rm queue device\n",
@@ -1639,7 +1656,7 @@
 	unsigned int func;
 	struct device *dev;
 	struct ap_card *ac;
-	int rc, dom, depth, type, comp_type;
+	int rc, dom, depth, type, comp_type, ml;
 
 	/* Is there currently a card device for this adapter ? */
 	dev = bus_find_device(&ap_bus_type, NULL,
@@ -1668,7 +1685,8 @@
 	for (dom = 0; dom <= ap_max_domain_id; dom++)
 		if (ap_test_config_usage_domain(dom)) {
 			qid = AP_MKQID(ap, dom);
-			if (ap_queue_info(qid, &type, &func, &depth, &decfg))
+			if (ap_queue_info(qid, &type, &func,
+					  &depth, &ml, &decfg))
 				break;
 		}
 	if (dom > ap_max_domain_id) {
@@ -1737,7 +1755,7 @@
 				    __func__, ap, type);
 			return;
 		}
-		ac = ap_card_create(ap, depth, type, comp_type, func);
+		ac = ap_card_create(ap, depth, type, comp_type, func, ml);
 		if (!ac) {
 			AP_DBF_WARN("%s(%d) ap_card_create() failed\n",
 				    __func__, ap);
@@ -1748,6 +1766,12 @@
 		dev->bus = &ap_bus_type;
 		dev->parent = ap_root_device;
 		dev_set_name(dev, "card%02x", ap);
+		/* maybe enlarge ap_max_msg_size to support this card */
+		if (ac->maxmsgsize > atomic_read(&ap_max_msg_size)) {
+			atomic_set(&ap_max_msg_size, ac->maxmsgsize);
+		AP_DBF_INFO("%s(%d) ap_max_msg_size updated to %d bytes\n",
+				    __func__, ap, atomic_read(&ap_max_msg_size));
+		}
 		/* Register the new card device with AP bus */
 		rc = device_register(dev);
 		if (rc) {
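
The tapq_gr2 union introduced above documents the GR2 layout returned by TAPQ; s390 is big-endian, so the first bitfield sits in the most significant bits. For cross-checking against the shift/mask decode it replaces, an equivalent open-coded version (sketch only):

struct tapq_gr2_fields {
	unsigned int fac, at, ml, qd;
};

static struct tapq_gr2_fields tapq_gr2_decode(unsigned long value)
{
	struct tapq_gr2_fields f = {
		.fac = value >> 32,		/* facility bits */
		.at  = (value >> 24) & 0xff,	/* AP type */
		.ml  = (value >> 8) & 0x0f,	/* msg limit field */
		.qd  = value & 0x0f,		/* queue depth */
	};
	return f;
}

The ml field counts in units of AP_TAPQ_ML_FIELD_CHUNK_SIZE (4096 bytes), so e.g. ml = 4 means a 16384-byte message limit; ap_card_create() below falls back to AP_DEFAULT_MAX_MSG_SIZE (12 KiB) when ml is 0.
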
diff --git a/drivers/s390/crypto/ap_bus.h b/drivers/s390/crypto/ap_bus.h
index 230fec6..8f18abd 100644
--- a/drivers/s390/crypto/ap_bus.h
+++ b/drivers/s390/crypto/ap_bus.h
@@ -25,8 +25,11 @@
 #define AP_RESET_TIMEOUT (HZ*0.7)	/* Time in ticks for reset timeouts. */
 #define AP_CONFIG_TIME 30	/* Time in seconds between AP bus rescans. */
 #define AP_POLL_TIME 1		/* Time in ticks between receive polls. */
+#define AP_DEFAULT_MAX_MSG_SIZE (12 * 1024)
+#define AP_TAPQ_ML_FIELD_CHUNK_SIZE (4096)
 
 extern int ap_domain_index;
+extern atomic_t ap_max_msg_size;
 
 extern DECLARE_HASHTABLE(ap_queues, 8);
 extern spinlock_t ap_queues_lock;
@@ -167,6 +170,7 @@
 	unsigned int functions;		/* AP device function bitfield. */
 	int queue_depth;		/* AP queue depth.*/
 	int id;				/* AP card number. */
+	unsigned int maxmsgsize;	/* AP msg limit for this card */
 	bool config;			/* configured state */
 	atomic64_t total_request_count;	/* # requests ever for this AP device.*/
 };
@@ -228,7 +232,8 @@
 	struct list_head list;		/* Request queueing. */
 	unsigned long long psmid;	/* Message id. */
 	void *msg;			/* Pointer to message buffer. */
-	unsigned int len;		/* Message length. */
+	unsigned int len;		/* actual msg len in msg buffer */
+	unsigned int bufsize;		/* allocated msg buffer size */
 	u16 flags;			/* Flags, see AP_MSG_FLAG_xxx */
 	struct ap_fi fi;		/* Failure Injection cmd */
 	int rc;				/* Return code for this message */
@@ -290,8 +295,8 @@
 void ap_queue_remove(struct ap_queue *aq);
 void ap_queue_init_state(struct ap_queue *aq);
 
-struct ap_card *ap_card_create(int id, int queue_depth, int raw_device_type,
-			       int comp_device_type, unsigned int functions);
+struct ap_card *ap_card_create(int id, int queue_depth, int raw_type,
+			       int comp_type, unsigned int functions, int ml);
 
 struct ap_perms {
 	unsigned long ioctlm[BITS_TO_LONGS(AP_IOCTLS)];
diff --git a/drivers/s390/crypto/ap_card.c b/drivers/s390/crypto/ap_card.c
index ca9afc5..196325a 100644
--- a/drivers/s390/crypto/ap_card.c
+++ b/drivers/s390/crypto/ap_card.c
@@ -174,6 +174,16 @@
 
 static DEVICE_ATTR_RW(config);
 
+static ssize_t max_msg_size_show(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	struct ap_card *ac = to_ap_card(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n", ac->maxmsgsize);
+}
+
+static DEVICE_ATTR_RO(max_msg_size);
+
 static struct attribute *ap_card_dev_attrs[] = {
 	&dev_attr_hwtype.attr,
 	&dev_attr_raw_hwtype.attr,
@@ -184,6 +194,7 @@
 	&dev_attr_pendingq_count.attr,
 	&dev_attr_modalias.attr,
 	&dev_attr_config.attr,
+	&dev_attr_max_msg_size.attr,
 	NULL
 };
 
@@ -209,7 +220,7 @@
 }
 
 struct ap_card *ap_card_create(int id, int queue_depth, int raw_type,
-			       int comp_type, unsigned int functions)
+			       int comp_type, unsigned int functions, int ml)
 {
 	struct ap_card *ac;
 
@@ -223,5 +234,8 @@
 	ac->queue_depth = queue_depth;
 	ac->functions = functions;
 	ac->id = id;
+	ac->maxmsgsize = ml > 0 ?
+		ml * AP_TAPQ_ML_FIELD_CHUNK_SIZE : AP_DEFAULT_MAX_MSG_SIZE;
+
 	return ac;
 }
diff --git a/drivers/s390/crypto/ap_queue.c b/drivers/s390/crypto/ap_queue.c
index 337353c..669f96f 100644
--- a/drivers/s390/crypto/ap_queue.c
+++ b/drivers/s390/crypto/ap_queue.c
@@ -101,7 +101,7 @@
 
 	if (msg == NULL)
 		return -EINVAL;
-	status = ap_dqap(qid, psmid, msg, length);
+	status = ap_dqap(qid, psmid, msg, length, NULL, NULL);
 	switch (status.response_code) {
 	case AP_RESPONSE_NORMAL:
 		return 0;
@@ -136,9 +136,24 @@
 	struct ap_queue_status status;
 	struct ap_message *ap_msg;
 	bool found = false;
+	size_t reslen;
+	unsigned long resgr0 = 0;
+	int parts = 0;
 
-	status = ap_dqap(aq->qid, &aq->reply->psmid,
-			 aq->reply->msg, aq->reply->len);
+	/*
+	 * Call DQAP in a loop until the response code and resgr0
+	 * indicate that the msg has been received completely. As the
+	 * very same buffer is reused, each invocation overwrites the
+	 * msg. That's intended; the receiver of the msg is informed
+	 * with a msg rc code of EMSGSIZE in such a case.
+	 */
+	do {
+		status = ap_dqap(aq->qid, &aq->reply->psmid,
+				 aq->reply->msg, aq->reply->bufsize,
+				 &reslen, &resgr0);
+		parts++;
+	} while (status.response_code == 0xFF && resgr0 != 0);
+
 	switch (status.response_code) {
 	case AP_RESPONSE_NORMAL:
 		aq->queue_count = max_t(int, 0, aq->queue_count - 1);
@@ -150,7 +165,12 @@
 				continue;
 			list_del_init(&ap_msg->list);
 			aq->pendingq_count--;
-			ap_msg->receive(aq, ap_msg, aq->reply);
+			if (parts > 1) {
+				ap_msg->rc = -EMSGSIZE;
+				ap_msg->receive(aq, ap_msg, NULL);
+			} else {
+				ap_msg->receive(aq, ap_msg, aq->reply);
+			}
 			found = true;
 			break;
 		}
diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c
index 5d726cd..529ffe26 100644
--- a/drivers/s390/crypto/zcrypt_api.c
+++ b/drivers/s390/crypto/zcrypt_api.c
@@ -900,6 +900,9 @@
 		if (xcRB->user_defined != AUTOSELECT &&
 		    xcRB->user_defined != zc->card->id)
 			continue;
+		/* check if request size exceeds card max msg size */
+		if (ap_msg.len > zc->card->maxmsgsize)
+			continue;
 		/* check if device node has admission for this card */
 		if (!zcrypt_check_card(perms, zc->card->id))
 			continue;
@@ -1068,6 +1071,9 @@
 		if (targets &&
 		    !is_desired_ep11_card(zc->card->id, target_num, targets))
 			continue;
+		/* check if request size exceeds card max msg size */
+		if (ap_msg.len > zc->card->maxmsgsize)
+			continue;
 		/* check if device node has admission for this card */
 		if (!zcrypt_check_card(perms, zc->card->id))
 			continue;
diff --git a/drivers/s390/crypto/zcrypt_cex4.c b/drivers/s390/crypto/zcrypt_cex4.c
index f4a6d37..f518b5f 100644
--- a/drivers/s390/crypto/zcrypt_cex4.c
+++ b/drivers/s390/crypto/zcrypt_cex4.c
@@ -28,9 +28,6 @@
 #define CEX4C_MIN_MOD_SIZE	 16	/*  256 bits	*/
 #define CEX4C_MAX_MOD_SIZE	512	/* 4096 bits	*/
 
-#define CEX4A_MAX_MESSAGE_SIZE	MSGTYPE50_CRB3_MAX_MSG_SIZE
-#define CEX4C_MAX_MESSAGE_SIZE	MSGTYPE06_MAX_MSG_SIZE
-
 /* Waiting time for requests to be processed.
  * Currently there are some types of request which are not deterministic.
  * But the maximum time limit managed by the stomper code is set to 60sec.
@@ -605,19 +602,19 @@
 	int rc;
 
 	if (ap_test_bit(&aq->card->functions, AP_FUNC_ACCEL)) {
-		zq = zcrypt_queue_alloc(CEX4A_MAX_MESSAGE_SIZE);
+		zq = zcrypt_queue_alloc(aq->card->maxmsgsize);
 		if (!zq)
 			return -ENOMEM;
 		zq->ops = zcrypt_msgtype(MSGTYPE50_NAME,
 					 MSGTYPE50_VARIANT_DEFAULT);
 	} else if (ap_test_bit(&aq->card->functions, AP_FUNC_COPRO)) {
-		zq = zcrypt_queue_alloc(CEX4C_MAX_MESSAGE_SIZE);
+		zq = zcrypt_queue_alloc(aq->card->maxmsgsize);
 		if (!zq)
 			return -ENOMEM;
 		zq->ops = zcrypt_msgtype(MSGTYPE06_NAME,
 					 MSGTYPE06_VARIANT_DEFAULT);
 	} else if (ap_test_bit(&aq->card->functions, AP_FUNC_EP11)) {
-		zq = zcrypt_queue_alloc(CEX4C_MAX_MESSAGE_SIZE);
+		zq = zcrypt_queue_alloc(aq->card->maxmsgsize);
 		if (!zq)
 			return -ENOMEM;
 		zq->ops = zcrypt_msgtype(MSGTYPE06_NAME,
diff --git a/drivers/s390/crypto/zcrypt_msgtype50.c b/drivers/s390/crypto/zcrypt_msgtype50.c
index 6d1800c..9940547 100644
--- a/drivers/s390/crypto/zcrypt_msgtype50.c
+++ b/drivers/s390/crypto/zcrypt_msgtype50.c
@@ -442,11 +442,13 @@
 		goto out;	/* ap_msg->rc indicates the error */
 	t80h = reply->msg;
 	if (t80h->type == TYPE80_RSP_CODE) {
-		if (aq->ap_dev.device_type == AP_DEVICE_TYPE_CEX2A)
-			len = min_t(int, CEX2A_MAX_RESPONSE_SIZE, t80h->len);
-		else
-			len = min_t(int, CEX3A_MAX_RESPONSE_SIZE, t80h->len);
-		memcpy(msg->msg, reply->msg, len);
+		len = t80h->len;
+		if (len > reply->bufsize || len > msg->bufsize) {
+			msg->rc = -EMSGSIZE;
+		} else {
+			memcpy(msg->msg, reply->msg, len);
+			msg->len = len;
+		}
 	} else
 		memcpy(msg->msg, reply->msg, sizeof(error_reply));
 out:
@@ -469,10 +471,9 @@
 	struct completion work;
 	int rc;
 
-	if (zq->zcard->user_space_type == ZCRYPT_CEX2A)
-		ap_msg->msg = kmalloc(MSGTYPE50_CRB2_MAX_MSG_SIZE, GFP_KERNEL);
-	else
-		ap_msg->msg = kmalloc(MSGTYPE50_CRB3_MAX_MSG_SIZE, GFP_KERNEL);
+	ap_msg->bufsize = (zq->zcard->user_space_type == ZCRYPT_CEX2A) ?
+		MSGTYPE50_CRB2_MAX_MSG_SIZE : MSGTYPE50_CRB3_MAX_MSG_SIZE;
+	ap_msg->msg = kmalloc(ap_msg->bufsize, GFP_KERNEL);
 	if (!ap_msg->msg)
 		return -ENOMEM;
 	ap_msg->receive = zcrypt_cex2a_receive;
@@ -515,10 +516,9 @@
 	struct completion work;
 	int rc;
 
-	if (zq->zcard->user_space_type == ZCRYPT_CEX2A)
-		ap_msg->msg = kmalloc(MSGTYPE50_CRB2_MAX_MSG_SIZE, GFP_KERNEL);
-	else
-		ap_msg->msg = kmalloc(MSGTYPE50_CRB3_MAX_MSG_SIZE, GFP_KERNEL);
+	ap_msg->bufsize = (zq->zcard->user_space_type == ZCRYPT_CEX2A) ?
+		MSGTYPE50_CRB2_MAX_MSG_SIZE : MSGTYPE50_CRB3_MAX_MSG_SIZE;
+	ap_msg->msg = kmalloc(ap_msg->bufsize, GFP_KERNEL);
 	if (!ap_msg->msg)
 		return -ENOMEM;
 	ap_msg->receive = zcrypt_cex2a_receive;
diff --git a/drivers/s390/crypto/zcrypt_msgtype6.c b/drivers/s390/crypto/zcrypt_msgtype6.c
index da6b2bf7..752c639 100644
--- a/drivers/s390/crypto/zcrypt_msgtype6.c
+++ b/drivers/s390/crypto/zcrypt_msgtype6.c
@@ -403,7 +403,7 @@
 	} __packed * msg = ap_msg->msg;
 
 	int rcblen = CEIL4(xcRB->request_control_blk_length);
-	int replylen, req_sumlen, resp_sumlen;
+	int req_sumlen, resp_sumlen;
 	char *req_data = ap_msg->msg + sizeof(struct type6_hdr) + rcblen;
 	char *function_code;
 
@@ -415,7 +415,7 @@
 	ap_msg->len = sizeof(struct type6_hdr) +
 		CEIL4(xcRB->request_control_blk_length) +
 		xcRB->request_data_length;
-	if (ap_msg->len > MSGTYPE06_MAX_MSG_SIZE)
+	if (ap_msg->len > ap_msg->bufsize)
 		return -EINVAL;
 
 	/*
@@ -435,12 +435,6 @@
 			xcRB->reply_control_blk_length)
 		return -EINVAL; /* overflow after alignment*/
 
-	replylen = sizeof(struct type86_fmt2_msg) +
-		CEIL4(xcRB->reply_control_blk_length) +
-		xcRB->reply_data_length;
-	if (replylen > MSGTYPE06_MAX_MSG_SIZE)
-		return -EINVAL;
-
 	/*
 	 * Overflow check
 	 * sum must be greater (or equal) than the largest operand
@@ -530,18 +524,13 @@
 		return -EINVAL; /* overflow after alignment*/
 
 	/* length checks */
-	ap_msg->len = sizeof(struct type6_hdr) + xcRB->req_len;
-	if (CEIL4(xcRB->req_len) > MSGTYPE06_MAX_MSG_SIZE -
-				   (sizeof(struct type6_hdr)))
+	ap_msg->len = sizeof(struct type6_hdr) + CEIL4(xcRB->req_len);
+	if (ap_msg->len > ap_msg->bufsize)
 		return -EINVAL;
 
 	if (CEIL4(xcRB->resp_len) < xcRB->resp_len)
 		return -EINVAL; /* overflow after alignment*/
 
-	if (CEIL4(xcRB->resp_len) > MSGTYPE06_MAX_MSG_SIZE -
-				    (sizeof(struct type86_fmt2_msg)))
-		return -EINVAL;
-
 	/* prepare type6 header */
 	msg->hdr = static_type6_ep11_hdr;
 	msg->hdr.ToCardLen1   = xcRB->req_len;
@@ -952,13 +941,21 @@
 		switch (resp_type->type) {
 		case CEXXC_RESPONSE_TYPE_ICA:
 			len = sizeof(struct type86x_reply) + t86r->length - 2;
-			len = min_t(int, CEXXC_MAX_ICA_RESPONSE_SIZE, len);
-			memcpy(msg->msg, reply->msg, len);
+			if (len > reply->bufsize || len > msg->bufsize) {
+				msg->rc = -EMSGSIZE;
+			} else {
+				memcpy(msg->msg, reply->msg, len);
+				msg->len = len;
+			}
 			break;
 		case CEXXC_RESPONSE_TYPE_XCRB:
 			len = t86r->fmt2.offset2 + t86r->fmt2.count2;
-			len = min_t(int, MSGTYPE06_MAX_MSG_SIZE, len);
-			memcpy(msg->msg, reply->msg, len);
+			if (len > reply->bufsize || len > msg->bufsize) {
+				msg->rc = -EMSGSIZE;
+			} else {
+				memcpy(msg->msg, reply->msg, len);
+				msg->len = len;
+			}
 			break;
 		default:
 			memcpy(msg->msg, &error_reply, sizeof(error_reply));
@@ -999,8 +996,12 @@
 		switch (resp_type->type) {
 		case CEXXC_RESPONSE_TYPE_EP11:
 			len = t86r->fmt2.offset1 + t86r->fmt2.count1;
-			len = min_t(int, MSGTYPE06_MAX_MSG_SIZE, len);
-			memcpy(msg->msg, reply->msg, len);
+			if (len > reply->bufsize || len > msg->bufsize) {
+				msg->rc = -EMSGSIZE;
+			} else {
+				memcpy(msg->msg, reply->msg, len);
+				msg->len = len;
+			}
 			break;
 		default:
 			memcpy(msg->msg, &error_reply, sizeof(error_reply));
@@ -1033,6 +1034,7 @@
 	ap_msg->msg = (void *) get_zeroed_page(GFP_KERNEL);
 	if (!ap_msg->msg)
 		return -ENOMEM;
+	ap_msg->bufsize = PAGE_SIZE;
 	ap_msg->receive = zcrypt_msgtype6_receive;
 	ap_msg->psmid = (((unsigned long long) current->pid) << 32) +
 		atomic_inc_return(&zcrypt_step);
@@ -1080,6 +1082,7 @@
 	ap_msg->msg = (void *) get_zeroed_page(GFP_KERNEL);
 	if (!ap_msg->msg)
 		return -ENOMEM;
+	ap_msg->bufsize = PAGE_SIZE;
 	ap_msg->receive = zcrypt_msgtype6_receive;
 	ap_msg->psmid = (((unsigned long long) current->pid) << 32) +
 		atomic_inc_return(&zcrypt_step);
@@ -1124,7 +1127,8 @@
 		.type = CEXXC_RESPONSE_TYPE_XCRB,
 	};
 
-	ap_msg->msg = kmalloc(MSGTYPE06_MAX_MSG_SIZE, GFP_KERNEL);
+	ap_msg->bufsize = atomic_read(&ap_max_msg_size);
+	ap_msg->msg = kmalloc(ap_msg->bufsize, GFP_KERNEL);
 	if (!ap_msg->msg)
 		return -ENOMEM;
 	ap_msg->receive = zcrypt_msgtype6_receive;
@@ -1181,7 +1185,8 @@
 		.type = CEXXC_RESPONSE_TYPE_EP11,
 	};
 
-	ap_msg->msg = kmalloc(MSGTYPE06_MAX_MSG_SIZE, GFP_KERNEL);
+	ap_msg->bufsize = atomic_read(&ap_max_msg_size);
+	ap_msg->msg = kmalloc(ap_msg->bufsize, GFP_KERNEL);
 	if (!ap_msg->msg)
 		return -ENOMEM;
 	ap_msg->receive = zcrypt_msgtype6_receive_ep11;
@@ -1277,7 +1282,8 @@
 		.type = CEXXC_RESPONSE_TYPE_XCRB,
 	};
 
-	ap_msg->msg = kmalloc(MSGTYPE06_MAX_MSG_SIZE, GFP_KERNEL);
+	ap_msg->bufsize = AP_DEFAULT_MAX_MSG_SIZE;
+	ap_msg->msg = kmalloc(ap_msg->bufsize, GFP_KERNEL);
 	if (!ap_msg->msg)
 		return -ENOMEM;
 	ap_msg->receive = zcrypt_msgtype6_receive;
diff --git a/drivers/s390/crypto/zcrypt_msgtype6.h b/drivers/s390/crypto/zcrypt_msgtype6.h
index 0a0bf07..155c735 100644
--- a/drivers/s390/crypto/zcrypt_msgtype6.h
+++ b/drivers/s390/crypto/zcrypt_msgtype6.h
@@ -19,8 +19,6 @@
 #define MSGTYPE06_VARIANT_NORNG		1
 #define MSGTYPE06_VARIANT_EP11		2
 
-#define MSGTYPE06_MAX_MSG_SIZE		(12*1024)
-
 /**
  * The type 6 message family is associated with CEXxC/CEXxP cards.
  *
diff --git a/drivers/s390/crypto/zcrypt_queue.c b/drivers/s390/crypto/zcrypt_queue.c
index 605904b..20f1228 100644
--- a/drivers/s390/crypto/zcrypt_queue.c
+++ b/drivers/s390/crypto/zcrypt_queue.c
@@ -111,17 +111,17 @@
 	return false;
 }
 
-struct zcrypt_queue *zcrypt_queue_alloc(size_t max_response_size)
+struct zcrypt_queue *zcrypt_queue_alloc(size_t reply_buf_size)
 {
 	struct zcrypt_queue *zq;
 
 	zq = kzalloc(sizeof(struct zcrypt_queue), GFP_KERNEL);
 	if (!zq)
 		return NULL;
-	zq->reply.msg = kmalloc(max_response_size, GFP_KERNEL);
+	zq->reply.msg = kmalloc(reply_buf_size, GFP_KERNEL);
 	if (!zq->reply.msg)
 		goto out_free;
-	zq->reply.len = max_response_size;
+	zq->reply.bufsize = reply_buf_size;
 	INIT_LIST_HEAD(&zq->list);
 	kref_init(&zq->refcount);
 	return zq;
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 47e1358..f39b34b 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -171,7 +171,6 @@
 	CPUHP_AP_PERF_X86_CSTATE_ONLINE,
 	CPUHP_AP_PERF_X86_IDXD_ONLINE,
 	CPUHP_AP_PERF_S390_CF_ONLINE,
-	CPUHP_AP_PERF_S390_CFD_ONLINE,
 	CPUHP_AP_PERF_S390_SF_ONLINE,
 	CPUHP_AP_PERF_ARM_CCI_ONLINE,
 	CPUHP_AP_PERF_ARM_CCN_ONLINE,