Merge tag 'for-linus' of git://git.armlinux.org.uk/~rmk/linux-arm
Pull ARM fixes from Russell King:
- fix for "hex" Kconfig default to use 0x0 rather than 0 to allow these
to be removed from defconfigs
- fix from Ard Biesheuvel for EFI HYP mode booting
* tag 'for-linus' of git://git.armlinux.org.uk/~rmk/linux-arm:
ARM: 8985/1: efi/decompressor: deal with HYP mode boot gracefully
ARM: 8984/1: Kconfig: set default ZBOOT_ROM_TEXT/BSS value to 0x0
diff --git a/Documentation/security/keys/core.rst b/Documentation/security/keys/core.rst
index 9367d0fe..cdc42cc 100644
--- a/Documentation/security/keys/core.rst
+++ b/Documentation/security/keys/core.rst
@@ -1030,6 +1030,63 @@
written into the output buffer. Verification returns 0 on success.
+ * Watch a key or keyring for changes::
+
+	long keyctl(KEYCTL_WATCH_KEY, key_serial_t key, int queue_fd,
+		    int watch_id);
+
+     This will set or remove a watch for changes on the specified key or
+     keyring.
+
+     "key" is the ID of the key to be watched.
+
+     "queue_fd" is a file descriptor referring to an open notification pipe
+     which manages the buffer into which notifications will be delivered.
+
+     "watch_id" is the ID to be placed in the notification record to
+     identify this watch.  It must be between -1 and 255, where -1 removes
+     the watch and 0-255 sets (or updates) the watch with that ID.
+
+ See Documentation/watch_queue.rst for more information.
+
+ Note that only one watch may be emplaced for any particular { key,
+ queue_fd } combination.
+
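+     An illustrative call sequence might look like the following (a sketch,
+     assuming "wfd" is the write end of a notification pipe prepared as
+     described in Documentation/watch_queue.rst and that watch ID 0x01 is
+     otherwise unused)::
+
+	/* Set (or update) watch 0x01 on the session keyring. */
+	keyctl(KEYCTL_WATCH_KEY, KEY_SPEC_SESSION_KEYRING, wfd, 0x01);
+
+	/* ... read notifications from the other end of the pipe ... */
+
+	/* Remove the watch again. */
+	keyctl(KEYCTL_WATCH_KEY, KEY_SPEC_SESSION_KEYRING, wfd, -1);
+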
+ Notification records look like::
+
+ struct key_notification {
+ struct watch_notification watch;
+ __u32 key_id;
+ __u32 aux;
+ };
+
+ In this, watch::type will be "WATCH_TYPE_KEY_NOTIFY" and subtype will be
+ one of::
+
+ NOTIFY_KEY_INSTANTIATED
+ NOTIFY_KEY_UPDATED
+ NOTIFY_KEY_LINKED
+ NOTIFY_KEY_UNLINKED
+ NOTIFY_KEY_CLEARED
+ NOTIFY_KEY_REVOKED
+ NOTIFY_KEY_INVALIDATED
+ NOTIFY_KEY_SETATTR
+
+ Where these indicate a key being instantiated/rejected, updated, a link
+ being made in a keyring, a link being removed from a keyring, a keyring
+ being cleared, a key being revoked, a key being invalidated or a key
+ having one of its attributes changed (user, group, perm, timeout,
+ restriction).
+
+ If a watched key is deleted, a basic watch_notification will be issued
+ with "type" set to WATCH_TYPE_META and "subtype" set to
+     WATCH_META_REMOVAL_NOTIFICATION.  The watch ID will be set in the
+ "info" field.
+
+ This needs to be configured by enabling:
+
+ "Provide key/keyring change notifications" (KEY_NOTIFICATIONS)
+
+
Kernel Services
===============
diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index 1f3da8f..59472cd 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -202,6 +202,7 @@
'W' 00-1F linux/wanrouter.h conflict! (pre 3.9)
'W' 00-3F sound/asound.h conflict!
'W' 40-5F drivers/pci/switch/switchtec.c
+'W' 60-61 linux/watch_queue.h
'X' all fs/xfs/xfs_fs.h, conflict!
fs/xfs/linux-2.6/xfs_ioctl32.h,
include/linux/falloc.h,
diff --git a/Documentation/watch_queue.rst b/Documentation/watch_queue.rst
new file mode 100644
index 0000000..849fad6
--- /dev/null
+++ b/Documentation/watch_queue.rst
@@ -0,0 +1,339 @@
+==============================
+General notification mechanism
+==============================
+
+The general notification mechanism is built on top of the standard pipe driver
+whereby it effectively splices notification messages from the kernel into pipes
+opened by userspace. This can be used in conjunction with::
+
+ * Key/keyring notifications
+
+
+The notification buffers can be enabled by:
+
+ "General setup"/"General notification queue"
+ (CONFIG_WATCH_QUEUE)
+
+This document has the following sections:
+
+.. contents:: :local:
+
+
+Overview
+========
+
+This facility appears as a pipe that is opened in a special mode. The pipe's
+internal ring buffer is used to hold messages that are generated by the kernel.
+These messages are then read out by read(). Splice and similar are disabled on
+such pipes because, under some circumstances, they may want to revert their
+additions to the ring, which could end up interleaved with notification
+messages.
+
+The owner of the pipe has to tell the kernel which sources it would like to
+watch through that pipe. Only sources that have been connected to a pipe will
+insert messages into it. Note that a source may be bound to multiple pipes and
+insert messages into all of them simultaneously.
+
+Filters may also be emplaced on a pipe so that certain source types and
+subevents can be ignored if they're not of interest.
+
+A message will be discarded if there isn't a slot available in the ring or if
+no preallocated message buffer is available. In both of these cases, read()
+will insert a WATCH_META_LOSS_NOTIFICATION message into the output buffer after
+the last message currently in the buffer has been read.
+
+Note that when producing a notification, the kernel does not wait for the
+consumers to collect it, but rather just continues on. This means that
+notifications can be generated whilst spinlocks are held and also protects the
+kernel from being held up indefinitely by a userspace malfunction.
+
+
+Message Structure
+=================
+
+Notification messages begin with a short header::
+
+ struct watch_notification {
+ __u32 type:24;
+ __u32 subtype:8;
+ __u32 info;
+ };
+
+"type" indicates the source of the notification record and "subtype" indicates
+the type of record from that source (see the Watch Sources section below). The
+type may also be "WATCH_TYPE_META". This is a special record type generated
+internally by the watch queue itself. There are two subtypes:
+
+ * WATCH_META_REMOVAL_NOTIFICATION
+ * WATCH_META_LOSS_NOTIFICATION
+
+The first indicates that an object on which a watch was installed was removed
+or destroyed and the second indicates that some messages have been lost.
+
+"info" indicates a bunch of things, including:
+
+ * The length of the message in bytes, including the header (mask with
+ WATCH_INFO_LENGTH and shift by WATCH_INFO_LENGTH__SHIFT). This indicates
+ the size of the record, which may be between 8 and 127 bytes.
+
+ * The watch ID (mask with WATCH_INFO_ID and shift by WATCH_INFO_ID__SHIFT).
+    This indicates the caller's ID of the watch, which may be between 0
+ and 255. Multiple watches may share a queue, and this provides a means to
+ distinguish them.
+
+ * A type-specific field (WATCH_INFO_TYPE_INFO). This is set by the
+ notification producer to indicate some meaning specific to the type and
+ subtype.
+
+Everything in info apart from the length can be used for filtering.
+
+The header can be followed by supplementary information.  The format of this
+is defined by the type and subtype.
+
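+As an illustration of the layout, a consumer might unpack the header like
+this (a sketch using the constants above; assume ``n`` has been copied out
+of the pipe)::
+
+	struct watch_notification n;	/* assumed filled from the pipe */
+	size_t len;
+	unsigned int id, ti;
+
+	/* Total record length in bytes, including the header (8..127). */
+	len = (n.info & WATCH_INFO_LENGTH) >> WATCH_INFO_LENGTH__SHIFT;
+
+	/* The caller-chosen watch ID (0..255), to tell watches apart. */
+	id = (n.info & WATCH_INFO_ID) >> WATCH_INFO_ID__SHIFT;
+
+	/* Type- and subtype-specific flags set by the producer. */
+	ti = n.info & WATCH_INFO_TYPE_INFO;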
+
+Watch List (Notification Source) API
+====================================
+
+A "watch list" is a list of watchers that are subscribed to a source of
+notifications. A list may be attached to an object (say a key or a superblock)
+or may be global (say for device events). From a userspace perspective, a
+non-global watch list is typically referred to by reference to the object it
+belongs to (such as using KEYCTL_WATCH_KEY and giving it a key serial number to
+watch that specific key).
+
+To manage a watch list, the following functions are provided:
+
+ * ``void init_watch_list(struct watch_list *wlist,
+			  void (*release_watch)(struct watch *watch));``
+
+ Initialise a watch list. If ``release_watch`` is not NULL, then this
+ indicates a function that should be called when the watch_list object is
+ destroyed to discard any references the watch list holds on the watched
+ object.
+
+ * ``void remove_watch_list(struct watch_list *wlist);``
+
+ This removes all of the watches subscribed to a watch_list and frees them
+ and then destroys the watch_list object itself.
+
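+For illustration, a driver that makes its objects watchable might use these
+as in the following sketch (``struct myobj`` and its helpers are
+hypothetical, not an existing kernel API)::
+
+	struct myobj {
+		struct watch_list	watchers;
+		/* ... */
+	};
+
+	static void myobj_release_watch(struct watch *watch)
+	{
+		/* Drop the reference the watch held on the object. */
+		myobj_put(watch->private);
+	}
+
+	static void myobj_init(struct myobj *obj)
+	{
+		init_watch_list(&obj->watchers, myobj_release_watch);
+	}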
+
+Watch Queue (Notification Output) API
+=====================================
+
+A "watch queue" is the buffer allocated by an application that notification
+records will be written into. The workings of this are hidden entirely inside
+of the pipe device driver, but it is necessary to gain a reference to it to set
+a watch. These can be managed with:
+
+ * ``struct watch_queue *get_watch_queue(int fd);``
+
+ Since watch queues are indicated to the kernel by the fd of the pipe that
+ implements the buffer, userspace must hand that fd through a system call.
+ This can be used to look up an opaque pointer to the watch queue from the
+ system call.
+
+ * ``void put_watch_queue(struct watch_queue *wqueue);``
+
+ This discards the reference obtained from ``get_watch_queue()``.
+
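+As a hedged sketch of the expected pattern (a hypothetical system call
+handler; ``my_subscribe()`` is a placeholder for the subscription step
+described in the next section)::
+
+	long my_watch_object(int obj_id, int watch_fd)
+	{
+		struct watch_queue *wqueue;
+		long ret;
+
+		wqueue = get_watch_queue(watch_fd);
+		if (IS_ERR(wqueue))
+			return PTR_ERR(wqueue);
+
+		ret = my_subscribe(obj_id, wqueue);
+
+		put_watch_queue(wqueue);
+		return ret;
+	}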
+
+Watch Subscription API
+======================
+
+A "watch" is a subscription on a watch list, indicating the watch queue, and
+thus the buffer, into which notification records should be written. The watch
+queue object may also carry filtering rules for that object, as set by
+userspace. Some parts of the watch struct can be set by the driver::
+
+ struct watch {
+ union {
+ u32 info_id; /* ID to be OR'd in to info field */
+ ...
+ };
+ void *private; /* Private data for the watched object */
+ u64 id; /* Internal identifier */
+ ...
+ };
+
+The ``info_id`` value should be an 8-bit number obtained from userspace and
+shifted by WATCH_INFO_ID__SHIFT. This is OR'd into the WATCH_INFO_ID field of
+struct watch_notification::info when and if the notification is written into
+the associated watch queue buffer.
+
+The ``private`` field is the driver's data associated with the watch_list and
+is cleaned up by the ``watch_list::release_watch()`` method.
+
+The ``id`` field is the source's ID. Notifications that are posted with a
+different ID are ignored.
+
+The following functions are provided to manage watches:
+
+ * ``void init_watch(struct watch *watch, struct watch_queue *wqueue);``
+
+ Initialise a watch object, setting its pointer to the watch queue, using
+ appropriate barriering to avoid lockdep complaints.
+
+ * ``int add_watch_to_object(struct watch *watch, struct watch_list *wlist);``
+
+ Subscribe a watch to a watch list (notification source). The
+ driver-settable fields in the watch struct must have been set before this
+ is called.
+
+ * ``int remove_watch_from_object(struct watch_list *wlist,
+ struct watch_queue *wqueue,
+ u64 id, false);``
+
+ Remove a watch from a watch list, where the watch must match the specified
+ watch queue (``wqueue``) and object identifier (``id``). A notification
+ (``WATCH_META_REMOVAL_NOTIFICATION``) is sent to the watch queue to
+ indicate that the watch got removed.
+
+ * ``int remove_watch_from_object(struct watch_list *wlist, NULL, 0, true);``
+
+ Remove all the watches from a watch list. It is expected that this will be
+ called preparatory to destruction and that the watch list will be
+ inaccessible to new watches by this point. A notification
+ (``WATCH_META_REMOVAL_NOTIFICATION``) is sent to the watch queue of each
+ subscribed watch to indicate that the watch got removed.
+
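+Putting these together, a subscription might look like the following sketch
+(hypothetical driver code; ``obj``, ``wqueue`` and ``userspace_id`` are
+assumed to exist and error unwinding is trimmed)::
+
+	struct watch *watch;
+	int ret;
+
+	watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+	if (!watch)
+		return -ENOMEM;
+
+	init_watch(watch, wqueue);
+	watch->id      = obj->id;	/* must match the ID used when posting */
+	watch->info_id = (u32)userspace_id << WATCH_INFO_ID__SHIFT;
+	watch->private = myobj_get(obj);	/* released by release_watch() */
+
+	ret = add_watch_to_object(watch, &obj->watchers);
+	if (ret < 0)
+		kfree(watch);	/* a real driver would drop the obj ref too */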
+
+Notification Posting API
+========================
+
+To post a notification to a watch list so that the subscribed watches can see
+it, the following function should be used::
+
+ void post_watch_notification(struct watch_list *wlist,
+ struct watch_notification *n,
+ const struct cred *cred,
+ u64 id);
+
+The notification should be preformatted and a pointer to the header (``n``)
+should be passed in. The notification may be larger than this; its size in
+bytes is noted in ``n->info & WATCH_INFO_LENGTH``.
+
+The ``cred`` struct indicates the credentials of the source (subject) and is
+passed to the LSMs, such as SELinux, to allow or suppress the recording of the
+note in each individual queue according to the credentials of that queue
+(object).
+
+The ``id`` is the ID of the source object (such as the serial number on a key).
+Only watches that have the same ID set in them will see this notification.
+
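+For example, a source might post a header-only record as in this sketch
+(``NOTIFY_MYOBJ_CHANGED`` and ``obj`` are hypothetical)::
+
+	struct watch_notification n;
+
+	n.type	  = WATCH_TYPE_KEY_NOTIFY;	/* or a source-specific type */
+	n.subtype = NOTIFY_MYOBJ_CHANGED;
+	n.info	  = sizeof(n);			/* length lives in the low bits */
+
+	post_watch_notification(&obj->watchers, &n, current_cred(), obj->id);
+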
+
+Watch Sources
+=============
+
+Any particular buffer can be fed from multiple sources. Sources include:
+
+ * WATCH_TYPE_KEY_NOTIFY
+
+    Notifications of this type indicate changes to keys and keyrings,
+    including changes to keyring contents and to the attributes of keys.
+
+ See Documentation/security/keys/core.rst for more information.
+
+
+Event Filtering
+===============
+
+Once a watch queue has been created, a set of filters can be applied to limit
+the events that are received using::
+
+ struct watch_notification_filter filter = {
+ ...
+ };
+ ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, &filter)
+
+The filter description is a variable of type::
+
+ struct watch_notification_filter {
+ __u32 nr_filters;
+ __u32 __reserved;
+ struct watch_notification_type_filter filters[];
+ };
+
+Where "nr_filters" is the number of filters in filters[] and "__reserved"
+should be 0. The "filters" array has elements of the following type::
+
+ struct watch_notification_type_filter {
+ __u32 type;
+ __u32 info_filter;
+ __u32 info_mask;
+ __u32 subtype_filter[8];
+ };
+
+Where:
+
+ * ``type`` is the event type to filter for and should be something like
+ "WATCH_TYPE_KEY_NOTIFY"
+
+ * ``info_filter`` and ``info_mask`` act as a filter on the info field of the
+ notification record. The notification is only written into the buffer if::
+
+ (watch.info & info_mask) == info_filter
+
+ This could be used, for example, to ignore events that are not exactly on
+ the watched point in a mount tree.
+
+ * ``subtype_filter`` is a bitmask indicating the subtypes that are of
+ interest. Bit 0 of subtype_filter[0] corresponds to subtype 0, bit 1 to
+ subtype 1, and so on.
+
+If the argument to the ioctl() is NULL, then the filters will be removed and
+all events from the watched sources will come through.
+
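+Tying this to the key source, a filter that passes only link and unlink
+events might be built like this sketch (the flexible ``filters[]`` array is
+wrapped in a struct so it can be initialised statically)::
+
+	struct {
+		struct watch_notification_filter f;
+		struct watch_notification_type_filter t[1];
+	} filter = {
+		.f.nr_filters	= 1,
+		.t[0] = {
+			.type		= WATCH_TYPE_KEY_NOTIFY,
+			.subtype_filter	= {
+				[0] = (1 << NOTIFY_KEY_LINKED) |
+				      (1 << NOTIFY_KEY_UNLINKED),
+			},
+		},
+	};
+
+	if (ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, &filter.f) == -1)
+		perror("ioctl");
+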
+
+Userspace Code Example
+======================
+
+A buffer is created with something like the following::
+
+	pipe2(fds, O_NOTIFICATION_PIPE);
+ ioctl(fds[1], IOC_WATCH_QUEUE_SET_SIZE, 256);
+
+It can then be set to receive keyring change notifications::
+
+ keyctl(KEYCTL_WATCH_KEY, KEY_SPEC_SESSION_KEYRING, fds[1], 0x01);
+
+The notifications can then be consumed by something like the following::
+
+	static void consumer(int rfd)
+ {
+ unsigned char buffer[128];
+ ssize_t buf_len;
+
+ while (buf_len = read(rfd, buffer, sizeof(buffer)),
+ buf_len > 0
+ ) {
+ void *p = buffer;
+ void *end = buffer + buf_len;
+ while (p < end) {
+ union {
+ struct watch_notification n;
+ unsigned char buf1[128];
+ } n;
+ size_t largest, len;
+
+ largest = end - p;
+ if (largest > 128)
+ largest = 128;
+ memcpy(&n, p, largest);
+
+				len = (n.n.info & WATCH_INFO_LENGTH) >>
+					WATCH_INFO_LENGTH__SHIFT;
+ if (len == 0 || len > largest)
+ return;
+
+				switch (n.n.type) {
+				case WATCH_TYPE_META:
+					got_meta(&n.n);
+					break;
+				case WATCH_TYPE_KEY_NOTIFY:
+					saw_key_change(&n.n);
+					break;
+				}
+
+ p += len;
+ }
+ }
+ }
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index ef17903..48f6e22 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -545,7 +545,7 @@
default "4" if !ALPHA_GENERIC && !ALPHA_MARVEL
help
MARVEL support can handle a maximum of 32 CPUs, all the others
- with working support have a maximum of 4 CPUs.
+ with working support have a maximum of 4 CPUs.
config ARCH_DISCONTIGMEM_ENABLE
bool "Discontiguous Memory Support"
@@ -657,7 +657,7 @@
endchoice
config HZ
- int
+ int
default 32 if HZ_32
default 64 if HZ_64
default 128 if HZ_128
diff --git a/arch/alpha/boot/tools/objstrip.c b/arch/alpha/boot/tools/objstrip.c
index 825a16f..08b430d2 100644
--- a/arch/alpha/boot/tools/objstrip.c
+++ b/arch/alpha/boot/tools/objstrip.c
@@ -148,7 +148,7 @@
#ifdef __ELF__
elf = (struct elfhdr *) buf;
- if (elf->e_ident[0] == 0x7f && strncmp((char *)elf->e_ident + 1, "ELF", 3) == 0) {
+ if (elf->e_ident[0] == 0x7f && str_has_prefix((char *)elf->e_ident + 1, "ELF")) {
if (elf->e_type != ET_EXEC) {
fprintf(stderr, "%s: %s is not an ELF executable\n",
prog_name, inname);
diff --git a/arch/alpha/include/asm/io.h b/arch/alpha/include/asm/io.h
index 13bea46..a4d0c19 100644
--- a/arch/alpha/include/asm/io.h
+++ b/arch/alpha/include/asm/io.h
@@ -309,14 +309,18 @@
#if IO_CONCAT(__IO_PREFIX,trivial_io_bw)
extern inline unsigned int ioread8(void __iomem *addr)
{
- unsigned int ret = IO_CONCAT(__IO_PREFIX,ioread8)(addr);
+ unsigned int ret;
+ mb();
+ ret = IO_CONCAT(__IO_PREFIX,ioread8)(addr);
mb();
return ret;
}
extern inline unsigned int ioread16(void __iomem *addr)
{
- unsigned int ret = IO_CONCAT(__IO_PREFIX,ioread16)(addr);
+ unsigned int ret;
+ mb();
+ ret = IO_CONCAT(__IO_PREFIX,ioread16)(addr);
mb();
return ret;
}
@@ -357,7 +361,9 @@
#if IO_CONCAT(__IO_PREFIX,trivial_io_lq)
extern inline unsigned int ioread32(void __iomem *addr)
{
- unsigned int ret = IO_CONCAT(__IO_PREFIX,ioread32)(addr);
+ unsigned int ret;
+ mb();
+ ret = IO_CONCAT(__IO_PREFIX,ioread32)(addr);
mb();
return ret;
}
@@ -402,14 +408,18 @@
extern inline u8 readb(const volatile void __iomem *addr)
{
- u8 ret = __raw_readb(addr);
+ u8 ret;
+ mb();
+ ret = __raw_readb(addr);
mb();
return ret;
}
extern inline u16 readw(const volatile void __iomem *addr)
{
- u16 ret = __raw_readw(addr);
+ u16 ret;
+ mb();
+ ret = __raw_readw(addr);
mb();
return ret;
}
@@ -450,14 +460,18 @@
extern inline u32 readl(const volatile void __iomem *addr)
{
- u32 ret = __raw_readl(addr);
+ u32 ret;
+ mb();
+ ret = __raw_readl(addr);
mb();
return ret;
}
extern inline u64 readq(const volatile void __iomem *addr)
{
- u64 ret = __raw_readq(addr);
+ u64 ret;
+ mb();
+ ret = __raw_readq(addr);
mb();
return ret;
}
@@ -486,14 +500,44 @@
#define outb_p outb
#define outw_p outw
#define outl_p outl
-#define readb_relaxed(addr) __raw_readb(addr)
-#define readw_relaxed(addr) __raw_readw(addr)
-#define readl_relaxed(addr) __raw_readl(addr)
-#define readq_relaxed(addr) __raw_readq(addr)
-#define writeb_relaxed(b, addr) __raw_writeb(b, addr)
-#define writew_relaxed(b, addr) __raw_writew(b, addr)
-#define writel_relaxed(b, addr) __raw_writel(b, addr)
-#define writeq_relaxed(b, addr) __raw_writeq(b, addr)
+
+extern u8 readb_relaxed(const volatile void __iomem *addr);
+extern u16 readw_relaxed(const volatile void __iomem *addr);
+extern u32 readl_relaxed(const volatile void __iomem *addr);
+extern u64 readq_relaxed(const volatile void __iomem *addr);
+
+#if IO_CONCAT(__IO_PREFIX,trivial_io_bw)
+extern inline u8 readb_relaxed(const volatile void __iomem *addr)
+{
+ mb();
+ return __raw_readb(addr);
+}
+
+extern inline u16 readw_relaxed(const volatile void __iomem *addr)
+{
+ mb();
+ return __raw_readw(addr);
+}
+#endif
+
+#if IO_CONCAT(__IO_PREFIX,trivial_io_lq)
+extern inline u32 readl_relaxed(const volatile void __iomem *addr)
+{
+ mb();
+ return __raw_readl(addr);
+}
+
+extern inline u64 readq_relaxed(const volatile void __iomem *addr)
+{
+ mb();
+ return __raw_readq(addr);
+}
+#endif
+
+#define writeb_relaxed writeb
+#define writew_relaxed writew
+#define writel_relaxed writel
+#define writeq_relaxed writeq
/*
* String version of IO memory access ops:
diff --git a/arch/alpha/kernel/io.c b/arch/alpha/kernel/io.c
index c025a3e..938de13 100644
--- a/arch/alpha/kernel/io.c
+++ b/arch/alpha/kernel/io.c
@@ -16,21 +16,27 @@
unsigned int
ioread8(void __iomem *addr)
{
- unsigned int ret = IO_CONCAT(__IO_PREFIX,ioread8)(addr);
+ unsigned int ret;
+ mb();
+ ret = IO_CONCAT(__IO_PREFIX,ioread8)(addr);
mb();
return ret;
}
unsigned int ioread16(void __iomem *addr)
{
- unsigned int ret = IO_CONCAT(__IO_PREFIX,ioread16)(addr);
+ unsigned int ret;
+ mb();
+ ret = IO_CONCAT(__IO_PREFIX,ioread16)(addr);
mb();
return ret;
}
unsigned int ioread32(void __iomem *addr)
{
- unsigned int ret = IO_CONCAT(__IO_PREFIX,ioread32)(addr);
+ unsigned int ret;
+ mb();
+ ret = IO_CONCAT(__IO_PREFIX,ioread32)(addr);
mb();
return ret;
}
@@ -148,28 +154,36 @@
u8 readb(const volatile void __iomem *addr)
{
- u8 ret = __raw_readb(addr);
+ u8 ret;
+ mb();
+ ret = __raw_readb(addr);
mb();
return ret;
}
u16 readw(const volatile void __iomem *addr)
{
- u16 ret = __raw_readw(addr);
+ u16 ret;
+ mb();
+ ret = __raw_readw(addr);
mb();
return ret;
}
u32 readl(const volatile void __iomem *addr)
{
- u32 ret = __raw_readl(addr);
+ u32 ret;
+ mb();
+ ret = __raw_readl(addr);
mb();
return ret;
}
u64 readq(const volatile void __iomem *addr)
{
- u64 ret = __raw_readq(addr);
+ u64 ret;
+ mb();
+ ret = __raw_readq(addr);
mb();
return ret;
}
@@ -207,6 +221,38 @@
EXPORT_SYMBOL(writel);
EXPORT_SYMBOL(writeq);
+/*
+ * The _relaxed functions must be ordered w.r.t. each other, but they don't
+ * have to be ordered w.r.t. other memory accesses.
+ */
+u8 readb_relaxed(const volatile void __iomem *addr)
+{
+ mb();
+ return __raw_readb(addr);
+}
+
+u16 readw_relaxed(const volatile void __iomem *addr)
+{
+ mb();
+ return __raw_readw(addr);
+}
+
+u32 readl_relaxed(const volatile void __iomem *addr)
+{
+ mb();
+ return __raw_readl(addr);
+}
+
+u64 readq_relaxed(const volatile void __iomem *addr)
+{
+ mb();
+ return __raw_readq(addr);
+}
+
+EXPORT_SYMBOL(readb_relaxed);
+EXPORT_SYMBOL(readw_relaxed);
+EXPORT_SYMBOL(readl_relaxed);
+EXPORT_SYMBOL(readq_relaxed);
/*
* Read COUNT 8-bit bytes from port PORT into memory starting at SRC.
diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 94e4cde..d5367a1 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -677,7 +677,7 @@
default:
error = -EOPNOTSUPP;
break;
- };
+ }
return error;
}
diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c
index 7f1925a..8103790 100644
--- a/arch/alpha/kernel/pci_iommu.c
+++ b/arch/alpha/kernel/pci_iommu.c
@@ -638,7 +638,7 @@
while (sg+1 < end && (int) sg[1].dma_address == -1) {
size += sg[1].length;
- sg++;
+ sg = sg_next(sg);
}
npages = iommu_num_pages(paddr, size, PAGE_SIZE);
diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c
index 53520f8..916e42d 100644
--- a/arch/alpha/kernel/setup.c
+++ b/arch/alpha/kernel/setup.c
@@ -253,7 +253,7 @@
/* Fix up for the Jensen's queer RTC placement. */
standard_io_resources[0].start = RTC_PORT(0);
- standard_io_resources[0].end = RTC_PORT(0) + 0x10;
+ standard_io_resources[0].end = RTC_PORT(0) + 0x0f;
for (i = 0; i < ARRAY_SIZE(standard_io_resources); ++i)
request_resource(io, standard_io_resources+i);
@@ -479,7 +479,7 @@
#ifndef alpha_using_srm
/* Assume that we've booted from SRM if we haven't booted from MILO.
Detect the later by looking for "MILO" in the system serial nr. */
- alpha_using_srm = strncmp((const char *)hwrpb->ssn, "MILO", 4) != 0;
+ alpha_using_srm = !str_has_prefix((const char *)hwrpb->ssn, "MILO");
#endif
#ifndef alpha_using_qemu
/* Similarly, look for QEMU. */
@@ -1425,6 +1425,7 @@
static void *
c_next(struct seq_file *f, void *v, loff_t *pos)
{
+ (*pos)++;
return NULL;
}
diff --git a/arch/alpha/kernel/sys_eiger.c b/arch/alpha/kernel/sys_eiger.c
index bf99dcf..aea8a54 100644
--- a/arch/alpha/kernel/sys_eiger.c
+++ b/arch/alpha/kernel/sys_eiger.c
@@ -175,7 +175,7 @@
case 0x03: bridge_count = 2; break; /* 2 */
case 0x07: bridge_count = 3; break; /* 3 */
case 0x0f: bridge_count = 4; break; /* 4 */
- };
+ }
slot = PCI_SLOT(dev->devfn);
while (dev->bus->self) {
diff --git a/arch/openrisc/kernel/entry.S b/arch/openrisc/kernel/entry.S
index a1d25c3..bc657e5 100644
--- a/arch/openrisc/kernel/entry.S
+++ b/arch/openrisc/kernel/entry.S
@@ -1166,13 +1166,13 @@
l.movhi r29,hi(sys_clone)
l.ori r29,r29,lo(sys_clone)
l.j _fork_save_extra_regs_and_call
- l.addi r7,r1,0
+ l.nop
ENTRY(__sys_fork)
l.movhi r29,hi(sys_fork)
l.ori r29,r29,lo(sys_fork)
l.j _fork_save_extra_regs_and_call
- l.addi r3,r1,0
+ l.nop
ENTRY(sys_rt_sigreturn)
l.jal _sys_rt_sigreturn
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d41812a..dde7446 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -181,7 +181,6 @@
select HAVE_HW_BREAKPOINT
select HAVE_IDE
select HAVE_IOREMAP_PROT
- select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64
select HAVE_IRQ_TIME_ACCOUNTING
select HAVE_KERNEL_BZIP2
select HAVE_KERNEL_GZIP
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index 85eb381..b7a5790 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -3,7 +3,13 @@
# Makefile for the x86 low level entry code
#
-OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y
+KASAN_SANITIZE := n
+UBSAN_SANITIZE := n
+KCOV_INSTRUMENT := n
+
+CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE) -fstack-protector -fstack-protector-strong
+CFLAGS_REMOVE_syscall_32.o = $(CC_FLAGS_FTRACE) -fstack-protector -fstack-protector-strong
+CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE) -fstack-protector -fstack-protector-strong
CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,)
CFLAGS_syscall_32.o += $(call cc-option,-Wno-override-init,)
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 1c7f13b..4208c1e 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -341,30 +341,13 @@
#endif
.endm
-#endif /* CONFIG_X86_64 */
+#else /* CONFIG_X86_64 */
+# undef UNWIND_HINT_IRET_REGS
+# define UNWIND_HINT_IRET_REGS
+#endif /* !CONFIG_X86_64 */
.macro STACKLEAK_ERASE
#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
call stackleak_erase
#endif
.endm
-
-/*
- * This does 'call enter_from_user_mode' unless we can avoid it based on
- * kernel config or using the static jump infrastructure.
- */
-.macro CALL_enter_from_user_mode
-#ifdef CONFIG_CONTEXT_TRACKING
-#ifdef CONFIG_JUMP_LABEL
- STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_key, def=0
-#endif
- call enter_from_user_mode
-.Lafter_call_\@:
-#endif
-.endm
-
-#ifdef CONFIG_PARAVIRT_XXL
-#define GET_CR2_INTO(reg) GET_CR2_INTO_AX ; _ASM_MOV %_ASM_AX, reg
-#else
-#define GET_CR2_INTO(reg) _ASM_MOV %cr2, reg
-#endif
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 76735ec..bd3f141 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -27,6 +27,11 @@
#include <linux/syscalls.h>
#include <linux/uaccess.h>
+#ifdef CONFIG_XEN_PV
+#include <xen/xen-ops.h>
+#include <xen/events.h>
+#endif
+
#include <asm/desc.h>
#include <asm/traps.h>
#include <asm/vdso.h>
@@ -35,21 +40,67 @@
#include <asm/nospec-branch.h>
#include <asm/io_bitmap.h>
#include <asm/syscall.h>
+#include <asm/irq_stack.h>
#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
#ifdef CONFIG_CONTEXT_TRACKING
-/* Called on entry from user mode with IRQs off. */
-__visible inline void enter_from_user_mode(void)
+/**
+ * enter_from_user_mode - Establish state when coming from user mode
+ *
+ * Syscall entry disables interrupts, but user mode is traced as interrupts
+ * enabled. Also with NO_HZ_FULL RCU might be idle.
+ *
+ * 1) Tell lockdep that interrupts are disabled
+ * 2) Invoke context tracking if enabled to reactivate RCU
+ * 3) Trace interrupts off state
+ */
+static noinstr void enter_from_user_mode(void)
{
- CT_WARN_ON(ct_state() != CONTEXT_USER);
+ enum ctx_state state = ct_state();
+
+ lockdep_hardirqs_off(CALLER_ADDR0);
user_exit_irqoff();
+
+ instrumentation_begin();
+ CT_WARN_ON(state != CONTEXT_USER);
+ trace_hardirqs_off_finish();
+ instrumentation_end();
}
#else
-static inline void enter_from_user_mode(void) {}
+static __always_inline void enter_from_user_mode(void)
+{
+ lockdep_hardirqs_off(CALLER_ADDR0);
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+ instrumentation_end();
+}
#endif
+/**
+ * exit_to_user_mode - Fixup state when exiting to user mode
+ *
+ * Syscall exit enables interrupts, but the kernel state is interrupts
+ * disabled when this is invoked. Also tell RCU about it.
+ *
+ * 1) Trace interrupts on state
+ * 2) Invoke context tracking if enabled to adjust RCU state
+ * 3) Clear CPU buffers if CPU is affected by MDS and the mitigation is on.
+ * 4) Tell lockdep that interrupts are enabled
+ */
+static __always_inline void exit_to_user_mode(void)
+{
+ instrumentation_begin();
+ trace_hardirqs_on_prepare();
+ lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ instrumentation_end();
+
+ user_enter_irqoff();
+ mds_user_clear_cpu_buffers();
+ lockdep_hardirqs_on(CALLER_ADDR0);
+}
+
static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
{
#ifdef CONFIG_X86_64
@@ -179,8 +230,7 @@
}
}
-/* Called with IRQs disabled. */
-__visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
+static void __prepare_exit_to_usermode(struct pt_regs *regs)
{
struct thread_info *ti = current_thread_info();
u32 cached_flags;
@@ -219,10 +269,14 @@
*/
ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
#endif
+}
- user_enter_irqoff();
-
- mds_user_clear_cpu_buffers();
+__visible noinstr void prepare_exit_to_usermode(struct pt_regs *regs)
+{
+ instrumentation_begin();
+ __prepare_exit_to_usermode(regs);
+ instrumentation_end();
+ exit_to_user_mode();
}
#define SYSCALL_EXIT_WORK_FLAGS \
@@ -251,11 +305,7 @@
tracehook_report_syscall_exit(regs, step);
}
-/*
- * Called with IRQs on and fully valid regs. Returns with IRQs off in a
- * state such that we can immediately switch to user mode.
- */
-__visible inline void syscall_return_slowpath(struct pt_regs *regs)
+static void __syscall_return_slowpath(struct pt_regs *regs)
{
struct thread_info *ti = current_thread_info();
u32 cached_flags = READ_ONCE(ti->flags);
@@ -276,15 +326,29 @@
syscall_slow_exit_work(regs, cached_flags);
local_irq_disable();
- prepare_exit_to_usermode(regs);
+ __prepare_exit_to_usermode(regs);
+}
+
+/*
+ * Called with IRQs on and fully valid regs. Returns with IRQs off in a
+ * state such that we can immediately switch to user mode.
+ */
+__visible noinstr void syscall_return_slowpath(struct pt_regs *regs)
+{
+ instrumentation_begin();
+ __syscall_return_slowpath(regs);
+ instrumentation_end();
+ exit_to_user_mode();
}
#ifdef CONFIG_X86_64
-__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
+__visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
{
struct thread_info *ti;
enter_from_user_mode();
+ instrumentation_begin();
+
local_irq_enable();
ti = current_thread_info();
if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
@@ -301,8 +365,10 @@
regs->ax = x32_sys_call_table[nr](regs);
#endif
}
+ __syscall_return_slowpath(regs);
- syscall_return_slowpath(regs);
+ instrumentation_end();
+ exit_to_user_mode();
}
#endif
@@ -313,7 +379,7 @@
* extremely hot in workloads that use it, and it's usually called from
* do_fast_syscall_32, so forcibly inline it to improve performance.
*/
-static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
+static void do_syscall_32_irqs_on(struct pt_regs *regs)
{
struct thread_info *ti = current_thread_info();
unsigned int nr = (unsigned int)regs->orig_ax;
@@ -337,27 +403,62 @@
regs->ax = ia32_sys_call_table[nr](regs);
}
- syscall_return_slowpath(regs);
+ __syscall_return_slowpath(regs);
}
/* Handles int $0x80 */
-__visible void do_int80_syscall_32(struct pt_regs *regs)
+__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
{
enter_from_user_mode();
+ instrumentation_begin();
+
local_irq_enable();
do_syscall_32_irqs_on(regs);
+
+ instrumentation_end();
+ exit_to_user_mode();
+}
+
+static bool __do_fast_syscall_32(struct pt_regs *regs)
+{
+ int res;
+
+ /* Fetch EBP from where the vDSO stashed it. */
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ /*
+ * Micro-optimization: the pointer we're following is
+ * explicitly 32 bits, so it can't be out of range.
+ */
+		res = __get_user(*(u32 *)&regs->bp,
+ (u32 __user __force *)(unsigned long)(u32)regs->sp);
+ } else {
+		res = get_user(*(u32 *)&regs->bp,
+ (u32 __user __force *)(unsigned long)(u32)regs->sp);
+ }
+
+ if (res) {
+ /* User code screwed up. */
+ regs->ax = -EFAULT;
+ local_irq_disable();
+ __prepare_exit_to_usermode(regs);
+ return false;
+ }
+
+ /* Now this is just like a normal syscall. */
+ do_syscall_32_irqs_on(regs);
+ return true;
}
/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
-__visible long do_fast_syscall_32(struct pt_regs *regs)
+__visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
{
/*
* Called using the internal vDSO SYSENTER/SYSCALL32 calling
* convention. Adjust regs so it looks like we entered using int80.
*/
-
unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
- vdso_image_32.sym_int80_landing_pad;
+ vdso_image_32.sym_int80_landing_pad;
+ bool success;
/*
* SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
@@ -367,33 +468,17 @@
regs->ip = landing_pad;
enter_from_user_mode();
+ instrumentation_begin();
local_irq_enable();
+ success = __do_fast_syscall_32(regs);
- /* Fetch EBP from where the vDSO stashed it. */
- if (
-#ifdef CONFIG_X86_64
- /*
- * Micro-optimization: the pointer we're following is explicitly
- * 32 bits, so it can't be out of range.
- */
-	    __get_user(*(u32 *)&regs->bp,
- (u32 __user __force *)(unsigned long)(u32)regs->sp)
-#else
-	    get_user(*(u32 *)&regs->bp,
- (u32 __user __force *)(unsigned long)(u32)regs->sp)
-#endif
- ) {
+ instrumentation_end();
+ exit_to_user_mode();
- /* User code screwed up. */
- local_irq_disable();
- regs->ax = -EFAULT;
- prepare_exit_to_usermode(regs);
- return 0; /* Keep it simple: use IRET. */
- }
-
- /* Now this is just like a normal syscall. */
- do_syscall_32_irqs_on(regs);
+ /* If it failed, keep it simple: use IRET. */
+ if (!success)
+ return 0;
#ifdef CONFIG_X86_64
/*
@@ -431,3 +516,266 @@
{
return -ENOSYS;
}
+
+/**
+ * idtentry_enter_cond_rcu - Handle state tracking on idtentry with conditional
+ * RCU handling
+ * @regs: Pointer to pt_regs of interrupted context
+ *
+ * Invokes:
+ * - lockdep irqflag state tracking as low level ASM entry disabled
+ * interrupts.
+ *
+ * - Context tracking if the exception hit user mode.
+ *
+ * - The hardirq tracer to keep the state consistent as low level ASM
+ * entry disabled interrupts.
+ *
+ * For kernel mode entries RCU handling is done conditionally. If RCU is
+ * watching then the only RCU requirement is to check whether the tick has
+ * to be restarted. If RCU is not watching then rcu_irq_enter() has to be
+ * invoked on entry and rcu_irq_exit() on exit.
+ *
+ * Avoiding the rcu_irq_enter/exit() calls is an optimization but also
+ * solves the problem of kernel mode pagefaults which can schedule, which
+ * is not possible after invoking rcu_irq_enter() without undoing it.
+ *
+ * For user mode entries enter_from_user_mode() must be invoked to
+ * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit
+ * would not be possible.
+ *
+ * Returns: True if RCU has been adjusted on a kernel entry
+ * False otherwise
+ *
+ * The return value must be fed into the rcu_exit argument of
+ * idtentry_exit_cond_rcu().
+ */
+bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs)
+{
+ if (user_mode(regs)) {
+ enter_from_user_mode();
+ return false;
+ }
+
+ /*
+ * If this entry hit the idle task invoke rcu_irq_enter() whether
+ * RCU is watching or not.
+ *
+	 * Interrupts can nest when the first interrupt invokes softirq
+ * processing on return which enables interrupts.
+ *
+ * Scheduler ticks in the idle task can mark quiescent state and
+ * terminate a grace period, if and only if the timer interrupt is
+ * not nested into another interrupt.
+ *
+	 * Checking for __rcu_is_watching() here would prevent the nesting
+	 * interrupt from invoking rcu_irq_enter(). If that nested interrupt is
+	 * the tick then rcu_flavor_sched_clock_irq() would wrongfully
+	 * assume that it is the first interrupt and eventually claim
+	 * quiescent state and end grace periods prematurely.
+ *
+ * Unconditionally invoke rcu_irq_enter() so RCU state stays
+ * consistent.
+ *
+ * TINY_RCU does not support EQS, so let the compiler eliminate
+ * this part when enabled.
+ */
+ if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
+ /*
+ * If RCU is not watching then the same careful
+ * sequence vs. lockdep and tracing is required
+ * as in enter_from_user_mode().
+ */
+ lockdep_hardirqs_off(CALLER_ADDR0);
+ rcu_irq_enter();
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+ instrumentation_end();
+
+ return true;
+ }
+
+ /*
+ * If RCU is watching then RCU only wants to check whether it needs
+ * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
+ * already contains a warning when RCU is not watching, so no point
+ * in having another one here.
+ */
+ instrumentation_begin();
+ rcu_irq_enter_check_tick();
+ /* Use the combo lockdep/tracing function */
+ trace_hardirqs_off();
+ instrumentation_end();
+
+ return false;
+}
+
+static void idtentry_exit_cond_resched(struct pt_regs *regs, bool may_sched)
+{
+ if (may_sched && !preempt_count()) {
+ /* Sanity check RCU and thread stack */
+ rcu_irq_exit_check_preempt();
+ if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
+ WARN_ON_ONCE(!on_thread_stack());
+ if (need_resched())
+ preempt_schedule_irq();
+ }
+ /* Covers both tracing and lockdep */
+ trace_hardirqs_on();
+}
+
+/**
+ * idtentry_exit_cond_rcu - Handle return from exception with conditional RCU
+ * handling
+ * @regs: Pointer to pt_regs (exception entry regs)
+ * @rcu_exit: Invoke rcu_irq_exit() if true
+ *
+ * Depending on the return target (kernel/user) this runs the necessary
+ * preemption and work checks if possible and required and returns to
+ * the caller with interrupts disabled and no further work pending.
+ *
+ * This is the last action before returning to the low level ASM code which
+ * just needs to return to the appropriate context.
+ *
+ * Counterpart to idtentry_enter_cond_rcu(). The return value of the entry
+ * function must be fed into the @rcu_exit argument.
+ */
+void noinstr idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit)
+{
+ lockdep_assert_irqs_disabled();
+
+ /* Check whether this returns to user mode */
+ if (user_mode(regs)) {
+ prepare_exit_to_usermode(regs);
+ } else if (regs->flags & X86_EFLAGS_IF) {
+ /*
+ * If RCU was not watching on entry this needs to be done
+ * carefully and needs the same ordering of lockdep/tracing
+ * and RCU as the return to user mode path.
+ */
+ if (rcu_exit) {
+ instrumentation_begin();
+ /* Tell the tracer that IRET will enable interrupts */
+ trace_hardirqs_on_prepare();
+ lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ instrumentation_end();
+ rcu_irq_exit();
+ lockdep_hardirqs_on(CALLER_ADDR0);
+ return;
+ }
+
+ instrumentation_begin();
+ idtentry_exit_cond_resched(regs, IS_ENABLED(CONFIG_PREEMPTION));
+ instrumentation_end();
+ } else {
+ /*
+ * IRQ flags state is correct already. Just tell RCU if it
+ * was not watching on entry.
+ */
+ if (rcu_exit)
+ rcu_irq_exit();
+ }
+}
+
+/**
+ * idtentry_enter_user - Handle state tracking on idtentry from user mode
+ * @regs: Pointer to pt_regs of interrupted context
+ *
+ * Invokes enter_from_user_mode() to establish the proper context for
+ * NOHZ_FULL. Otherwise scheduling on exit would not be possible.
+ */
+void noinstr idtentry_enter_user(struct pt_regs *regs)
+{
+ enter_from_user_mode();
+}
+
+/**
+ * idtentry_exit_user - Handle return from exception to user mode
+ * @regs: Pointer to pt_regs (exception entry regs)
+ *
+ * Runs the necessary preemption and work checks and returns to the caller
+ * with interrupts disabled and no further work pending.
+ *
+ * This is the last action before returning to the low level ASM code which
+ * just needs to return to the appropriate context.
+ *
+ * Counterpart to idtentry_enter_user().
+ */
+void noinstr idtentry_exit_user(struct pt_regs *regs)
+{
+ lockdep_assert_irqs_disabled();
+
+ prepare_exit_to_usermode(regs);
+}
+
+#ifdef CONFIG_XEN_PV
+#ifndef CONFIG_PREEMPTION
+/*
+ * Some hypercalls issued by the toolstack can take many 10s of
+ * seconds. Allow tasks running hypercalls via the privcmd driver to
+ * be voluntarily preempted even if full kernel preemption is
+ * disabled.
+ *
+ * Such preemptible hypercalls are bracketed by
+ * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
+ * calls.
+ */
+DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
+EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
+
+/*
+ * In case of scheduling the flag must be cleared and restored after
+ * returning from schedule as the task might move to a different CPU.
+ */
+static __always_inline bool get_and_clear_inhcall(void)
+{
+ bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);
+
+ __this_cpu_write(xen_in_preemptible_hcall, false);
+ return inhcall;
+}
+
+static __always_inline void restore_inhcall(bool inhcall)
+{
+ __this_cpu_write(xen_in_preemptible_hcall, inhcall);
+}
+#else
+static __always_inline bool get_and_clear_inhcall(void) { return false; }
+static __always_inline void restore_inhcall(bool inhcall) { }
+#endif
+
+static void __xen_pv_evtchn_do_upcall(void)
+{
+ irq_enter_rcu();
+ inc_irq_stat(irq_hv_callback_count);
+
+ xen_hvm_evtchn_do_upcall();
+
+ irq_exit_rcu();
+}
+
+__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs;
+ bool inhcall, rcu_exit;
+
+ rcu_exit = idtentry_enter_cond_rcu(regs);
+ old_regs = set_irq_regs(regs);
+
+ instrumentation_begin();
+ run_on_irqstack_cond(__xen_pv_evtchn_do_upcall, NULL, regs);
+	instrumentation_end();
+
+ set_irq_regs(old_regs);
+
+ inhcall = get_and_clear_inhcall();
+ if (inhcall && !WARN_ON_ONCE(rcu_exit)) {
+ instrumentation_begin();
+ idtentry_exit_cond_resched(regs, true);
+ instrumentation_end();
+ restore_inhcall(inhcall);
+ } else {
+ idtentry_exit_cond_rcu(regs, rcu_exit);
+ }
+}
+#endif /* CONFIG_XEN_PV */
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index a5eed84..024d7d2 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -44,40 +44,13 @@
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/frame.h>
+#include <asm/trapnr.h>
#include <asm/nospec-branch.h>
#include "calling.h"
.section .entry.text, "ax"
-/*
- * We use macros for low-level operations which need to be overridden
- * for paravirtualization. The following will never clobber any registers:
- * INTERRUPT_RETURN (aka. "iret")
- * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
- * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
- *
- * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
- * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
- * Allowing a register to be clobbered can shrink the paravirt replacement
- * enough to patch inline, increasing performance.
- */
-
-#ifdef CONFIG_PREEMPTION
-# define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
-#else
-# define preempt_stop(clobbers)
-#endif
-
-.macro TRACE_IRQS_IRET
-#ifdef CONFIG_TRACE_IRQFLAGS
- testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off?
- jz 1f
- TRACE_IRQS_ON
-1:
-#endif
-.endm
-
#define PTI_SWITCH_MASK (1 << PAGE_SHIFT)
/*
@@ -726,10 +699,68 @@
.Lend_\@:
.endm
+
+/**
+ * idtentry - Macro to generate entry stubs for simple IDT entries
+ * @vector: Vector number
+ * @asmsym: ASM symbol for the entry point
+ * @cfunc: C function to be called
+ * @has_error_code: Hardware pushed error code on stack
+ */
+.macro idtentry vector asmsym cfunc has_error_code:req
+SYM_CODE_START(\asmsym)
+ ASM_CLAC
+ cld
+
+ .if \has_error_code == 0
+ pushl $0 /* Clear the error code */
+ .endif
+
+ /* Push the C-function address into the GS slot */
+ pushl $\cfunc
+ /* Invoke the common exception entry */
+ jmp handle_exception
+SYM_CODE_END(\asmsym)
+.endm
+
+.macro idtentry_irq vector cfunc
+ .p2align CONFIG_X86_L1_CACHE_SHIFT
+SYM_CODE_START_LOCAL(asm_\cfunc)
+ ASM_CLAC
+ SAVE_ALL switch_stacks=1
+ ENCODE_FRAME_POINTER
+ movl %esp, %eax
+ movl PT_ORIG_EAX(%esp), %edx /* get the vector from stack */
+ movl $-1, PT_ORIG_EAX(%esp) /* no syscall to restart */
+ call \cfunc
+ jmp handle_exception_return
+SYM_CODE_END(asm_\cfunc)
+.endm
+
+.macro idtentry_sysvec vector cfunc
+ idtentry \vector asm_\cfunc \cfunc has_error_code=0
+.endm
+
+/*
+ * Include the defines which emit the idt entries which are shared
+ * between 32 and 64 bit and emit the __irqentry_text_* markers
+ * so the stacktrace boundary checks work.
+ */
+ .align 16
+ .globl __irqentry_text_start
+__irqentry_text_start:
+
+#include <asm/idtentry.h>
+
+ .align 16
+ .globl __irqentry_text_end
+__irqentry_text_end:
+
/*
* %eax: prev task
* %edx: next task
*/
+.pushsection .text, "ax"
SYM_CODE_START(__switch_to_asm)
/*
* Save callee-saved registers
@@ -776,6 +807,7 @@
jmp __switch_to
SYM_CODE_END(__switch_to_asm)
+.popsection
/*
* The unwinder expects the last frame on the stack to always be at the same
@@ -784,6 +816,7 @@
* asmlinkage function so its argument has to be pushed on the stack. This
* wrapper creates a proper "end of stack" frame header before the call.
*/
+.pushsection .text, "ax"
SYM_FUNC_START(schedule_tail_wrapper)
FRAME_BEGIN
@@ -794,6 +827,8 @@
FRAME_END
ret
SYM_FUNC_END(schedule_tail_wrapper)
+.popsection
+
/*
* A newly forked process directly context switches into this address.
*
@@ -801,6 +836,7 @@
* ebx: kernel thread func (NULL for user thread)
* edi: kernel thread arg
*/
+.pushsection .text, "ax"
SYM_CODE_START(ret_from_fork)
call schedule_tail_wrapper
@@ -811,8 +847,7 @@
/* When we fork, we trace the syscall return in the child, too. */
movl %esp, %eax
call syscall_return_slowpath
- STACKLEAK_ERASE
- jmp restore_all
+ jmp .Lsyscall_32_done
/* kernel thread */
1: movl %edi, %eax
@@ -825,38 +860,7 @@
movl $0, PT_EAX(%esp)
jmp 2b
SYM_CODE_END(ret_from_fork)
-
-/*
- * Return to user mode is not as complex as all this looks,
- * but we want the default path for a system call return to
- * go as quickly as possible which is why some of this is
- * less clear than it otherwise should be.
- */
-
- # userspace resumption stub bypassing syscall exit tracing
-SYM_CODE_START_LOCAL(ret_from_exception)
- preempt_stop(CLBR_ANY)
-ret_from_intr:
-#ifdef CONFIG_VM86
- movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
- movb PT_CS(%esp), %al
- andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
-#else
- /*
- * We can be coming here from child spawned by kernel_thread().
- */
- movl PT_CS(%esp), %eax
- andl $SEGMENT_RPL_MASK, %eax
-#endif
- cmpl $USER_RPL, %eax
- jb restore_all_kernel # not returning to v8086 or userspace
-
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_OFF
- movl %esp, %eax
- call prepare_exit_to_usermode
- jmp restore_all
-SYM_CODE_END(ret_from_exception)
+.popsection
SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
/*
@@ -960,12 +964,6 @@
jnz .Lsysenter_fix_flags
.Lsysenter_flags_fixed:
- /*
- * User mode is traced as though IRQs are on, and SYSENTER
- * turned them off.
- */
- TRACE_IRQS_OFF
-
movl %esp, %eax
call do_fast_syscall_32
/* XEN PV guests always use IRET path */
@@ -974,8 +972,7 @@
STACKLEAK_ERASE
-/* Opportunistic SYSEXIT */
- TRACE_IRQS_ON /* User mode traces as IRQs on. */
+ /* Opportunistic SYSEXIT */
/*
* Setup entry stack - we keep the pointer in %eax and do the
@@ -1075,20 +1072,12 @@
SAVE_ALL pt_regs_ax=$-ENOSYS switch_stacks=1 /* save rest */
- /*
- * User mode is traced as though IRQs are on, and the interrupt gate
- * turned them off.
- */
- TRACE_IRQS_OFF
-
movl %esp, %eax
call do_int80_syscall_32
.Lsyscall_32_done:
-
STACKLEAK_ERASE
-restore_all:
- TRACE_IRQS_ON
+restore_all_switch_stack:
SWITCH_TO_ENTRY_STACK
CHECK_AND_APPLY_ESPFIX
@@ -1107,26 +1096,10 @@
*/
INTERRUPT_RETURN
-restore_all_kernel:
-#ifdef CONFIG_PREEMPTION
- DISABLE_INTERRUPTS(CLBR_ANY)
- cmpl $0, PER_CPU_VAR(__preempt_count)
- jnz .Lno_preempt
- testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
- jz .Lno_preempt
- call preempt_schedule_irq
-.Lno_preempt:
-#endif
- TRACE_IRQS_IRET
- PARANOID_EXIT_TO_KERNEL_MODE
- BUG_IF_WRONG_CR3
- RESTORE_REGS 4
- jmp .Lirq_return
-
.section .fixup, "ax"
-SYM_CODE_START(iret_exc)
+SYM_CODE_START(asm_iret_error)
pushl $0 # no error code
- pushl $do_iret_error
+ pushl $iret_error
#ifdef CONFIG_DEBUG_ENTRY
/*
@@ -1140,10 +1113,10 @@
popl %eax
#endif
- jmp common_exception
-SYM_CODE_END(iret_exc)
+ jmp handle_exception
+SYM_CODE_END(asm_iret_error)
.previous
- _ASM_EXTABLE(.Lirq_return, iret_exc)
+ _ASM_EXTABLE(.Lirq_return, asm_iret_error)
SYM_FUNC_END(entry_INT80_32)
.macro FIXUP_ESPFIX_STACK
@@ -1193,192 +1166,21 @@
#endif
.endm
-/*
- * Build the entry stubs with some assembler magic.
- * We pack 1 stub into every 8-byte block.
- */
- .align 8
-SYM_CODE_START(irq_entries_start)
- vector=FIRST_EXTERNAL_VECTOR
- .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
- pushl $(~vector+0x80) /* Note: always in signed byte range */
- vector=vector+1
- jmp common_interrupt
- .align 8
- .endr
-SYM_CODE_END(irq_entries_start)
-
-#ifdef CONFIG_X86_LOCAL_APIC
- .align 8
-SYM_CODE_START(spurious_entries_start)
- vector=FIRST_SYSTEM_VECTOR
- .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR)
- pushl $(~vector+0x80) /* Note: always in signed byte range */
- vector=vector+1
- jmp common_spurious
- .align 8
- .endr
-SYM_CODE_END(spurious_entries_start)
-
-SYM_CODE_START_LOCAL(common_spurious)
- ASM_CLAC
- addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */
- SAVE_ALL switch_stacks=1
- ENCODE_FRAME_POINTER
- TRACE_IRQS_OFF
- movl %esp, %eax
- call smp_spurious_interrupt
- jmp ret_from_intr
-SYM_CODE_END(common_spurious)
-#endif
-
-/*
- * the CPU automatically disables interrupts when executing an IRQ vector,
- * so IRQ-flags tracing has to follow that:
- */
- .p2align CONFIG_X86_L1_CACHE_SHIFT
-SYM_CODE_START_LOCAL(common_interrupt)
- ASM_CLAC
- addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */
-
- SAVE_ALL switch_stacks=1
- ENCODE_FRAME_POINTER
- TRACE_IRQS_OFF
- movl %esp, %eax
- call do_IRQ
- jmp ret_from_intr
-SYM_CODE_END(common_interrupt)
-
-#define BUILD_INTERRUPT3(name, nr, fn) \
-SYM_FUNC_START(name) \
- ASM_CLAC; \
- pushl $~(nr); \
- SAVE_ALL switch_stacks=1; \
- ENCODE_FRAME_POINTER; \
- TRACE_IRQS_OFF \
- movl %esp, %eax; \
- call fn; \
- jmp ret_from_intr; \
-SYM_FUNC_END(name)
-
-#define BUILD_INTERRUPT(name, nr) \
- BUILD_INTERRUPT3(name, nr, smp_##name); \
-
-/* The include is where all of the SMP etc. interrupts come from */
-#include <asm/entry_arch.h>
-
-SYM_CODE_START(coprocessor_error)
- ASM_CLAC
- pushl $0
- pushl $do_coprocessor_error
- jmp common_exception
-SYM_CODE_END(coprocessor_error)
-
-SYM_CODE_START(simd_coprocessor_error)
- ASM_CLAC
- pushl $0
-#ifdef CONFIG_X86_INVD_BUG
- /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
- ALTERNATIVE "pushl $do_general_protection", \
- "pushl $do_simd_coprocessor_error", \
- X86_FEATURE_XMM
-#else
- pushl $do_simd_coprocessor_error
-#endif
- jmp common_exception
-SYM_CODE_END(simd_coprocessor_error)
-
-SYM_CODE_START(device_not_available)
- ASM_CLAC
- pushl $0
- pushl $do_device_not_available
- jmp common_exception
-SYM_CODE_END(device_not_available)
-
#ifdef CONFIG_PARAVIRT
SYM_CODE_START(native_iret)
iret
- _ASM_EXTABLE(native_iret, iret_exc)
+ _ASM_EXTABLE(native_iret, asm_iret_error)
SYM_CODE_END(native_iret)
#endif
-SYM_CODE_START(overflow)
- ASM_CLAC
- pushl $0
- pushl $do_overflow
- jmp common_exception
-SYM_CODE_END(overflow)
-
-SYM_CODE_START(bounds)
- ASM_CLAC
- pushl $0
- pushl $do_bounds
- jmp common_exception
-SYM_CODE_END(bounds)
-
-SYM_CODE_START(invalid_op)
- ASM_CLAC
- pushl $0
- pushl $do_invalid_op
- jmp common_exception
-SYM_CODE_END(invalid_op)
-
-SYM_CODE_START(coprocessor_segment_overrun)
- ASM_CLAC
- pushl $0
- pushl $do_coprocessor_segment_overrun
- jmp common_exception
-SYM_CODE_END(coprocessor_segment_overrun)
-
-SYM_CODE_START(invalid_TSS)
- ASM_CLAC
- pushl $do_invalid_TSS
- jmp common_exception
-SYM_CODE_END(invalid_TSS)
-
-SYM_CODE_START(segment_not_present)
- ASM_CLAC
- pushl $do_segment_not_present
- jmp common_exception
-SYM_CODE_END(segment_not_present)
-
-SYM_CODE_START(stack_segment)
- ASM_CLAC
- pushl $do_stack_segment
- jmp common_exception
-SYM_CODE_END(stack_segment)
-
-SYM_CODE_START(alignment_check)
- ASM_CLAC
- pushl $do_alignment_check
- jmp common_exception
-SYM_CODE_END(alignment_check)
-
-SYM_CODE_START(divide_error)
- ASM_CLAC
- pushl $0 # no error code
- pushl $do_divide_error
- jmp common_exception
-SYM_CODE_END(divide_error)
-
-#ifdef CONFIG_X86_MCE
-SYM_CODE_START(machine_check)
- ASM_CLAC
- pushl $0
- pushl $do_mce
- jmp common_exception
-SYM_CODE_END(machine_check)
-#endif
-
-SYM_CODE_START(spurious_interrupt_bug)
- ASM_CLAC
- pushl $0
- pushl $do_spurious_interrupt_bug
- jmp common_exception
-SYM_CODE_END(spurious_interrupt_bug)
-
#ifdef CONFIG_XEN_PV
-SYM_FUNC_START(xen_hypervisor_callback)
+/*
+ * See comment in entry_64.S for further explanation
+ *
+ * Note: This is not an actual IDT entry point. It's a XEN specific entry
+ * point and therefore named to match the 64-bit trampoline counterpart.
+ */
+SYM_FUNC_START(xen_asm_exc_xen_hypervisor_callback)
/*
* Check to see if we got the event in the critical
* region in xen_iret_direct, after we've reenabled
@@ -1395,14 +1197,11 @@
pushl $-1 /* orig_ax = -1 => not a system call */
SAVE_ALL
ENCODE_FRAME_POINTER
- TRACE_IRQS_OFF
+
mov %esp, %eax
- call xen_evtchn_do_upcall
-#ifndef CONFIG_PREEMPTION
- call xen_maybe_preempt_hcall
-#endif
- jmp ret_from_intr
-SYM_FUNC_END(xen_hypervisor_callback)
+ call xen_pv_evtchn_do_upcall
+ jmp handle_exception_return
+SYM_FUNC_END(xen_asm_exc_xen_hypervisor_callback)
/*
* Hypervisor uses this for application faults while it executes.
@@ -1429,11 +1228,11 @@
popl %eax
lea 16(%esp), %esp
jz 5f
- jmp iret_exc
+ jmp asm_iret_error
5: pushl $-1 /* orig_ax = -1 => not a system call */
SAVE_ALL
ENCODE_FRAME_POINTER
- jmp ret_from_exception
+ jmp handle_exception_return
.section .fixup, "ax"
6: xorl %eax, %eax
@@ -1456,56 +1255,7 @@
SYM_FUNC_END(xen_failsafe_callback)
#endif /* CONFIG_XEN_PV */
-#ifdef CONFIG_XEN_PVHVM
-BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
- xen_evtchn_do_upcall)
-#endif
-
-
-#if IS_ENABLED(CONFIG_HYPERV)
-
-BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
- hyperv_vector_handler)
-
-BUILD_INTERRUPT3(hyperv_reenlightenment_vector, HYPERV_REENLIGHTENMENT_VECTOR,
- hyperv_reenlightenment_intr)
-
-BUILD_INTERRUPT3(hv_stimer0_callback_vector, HYPERV_STIMER0_VECTOR,
- hv_stimer0_vector_handler)
-
-#endif /* CONFIG_HYPERV */
-
-SYM_CODE_START(page_fault)
- ASM_CLAC
- pushl $do_page_fault
- jmp common_exception_read_cr2
-SYM_CODE_END(page_fault)
-
-SYM_CODE_START_LOCAL_NOALIGN(common_exception_read_cr2)
- /* the function address is in %gs's slot on the stack */
- SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1
-
- ENCODE_FRAME_POINTER
-
- /* fixup %gs */
- GS_TO_REG %ecx
- movl PT_GS(%esp), %edi
- REG_TO_PTGS %ecx
- SET_KERNEL_GS %ecx
-
- GET_CR2_INTO(%ecx) # might clobber %eax
-
- /* fixup orig %eax */
- movl PT_ORIG_EAX(%esp), %edx # get the error code
- movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
-
- TRACE_IRQS_OFF
- movl %esp, %eax # pt_regs pointer
- CALL_NOSPEC edi
- jmp ret_from_exception
-SYM_CODE_END(common_exception_read_cr2)
-
-SYM_CODE_START_LOCAL_NOALIGN(common_exception)
+SYM_CODE_START_LOCAL_NOALIGN(handle_exception)
/* the function address is in %gs's slot on the stack */
SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1
ENCODE_FRAME_POINTER
@@ -1520,23 +1270,35 @@
movl PT_ORIG_EAX(%esp), %edx # get the error code
movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
- TRACE_IRQS_OFF
movl %esp, %eax # pt_regs pointer
CALL_NOSPEC edi
- jmp ret_from_exception
-SYM_CODE_END(common_exception)
-SYM_CODE_START(debug)
+handle_exception_return:
+#ifdef CONFIG_VM86
+ movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
+ movb PT_CS(%esp), %al
+ andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
+#else
/*
- * Entry from sysenter is now handled in common_exception
+ * We can be coming here from child spawned by kernel_thread().
*/
- ASM_CLAC
- pushl $0
- pushl $do_debug
- jmp common_exception
-SYM_CODE_END(debug)
+ movl PT_CS(%esp), %eax
+ andl $SEGMENT_RPL_MASK, %eax
+#endif
+ cmpl $USER_RPL, %eax # returning to v8086 or userspace ?
+ jnb ret_to_user
-SYM_CODE_START(double_fault)
+ PARANOID_EXIT_TO_KERNEL_MODE
+ BUG_IF_WRONG_CR3
+ RESTORE_REGS 4
+ jmp .Lirq_return
+
+ret_to_user:
+ movl %esp, %eax
+ jmp restore_all_switch_stack
+SYM_CODE_END(handle_exception)
+
+SYM_CODE_START(asm_exc_double_fault)
1:
/*
* This is a task gate handler, not an interrupt gate handler.
@@ -1574,7 +1336,7 @@
1:
hlt
jmp 1b
-SYM_CODE_END(double_fault)
+SYM_CODE_END(asm_exc_double_fault)
/*
* NMI is doubly nasty. It can happen on the first instruction of
@@ -1583,7 +1345,7 @@
* switched stacks. We handle both conditions by simply checking whether we
* interrupted kernel code running on the SYSENTER stack.
*/
-SYM_CODE_START(nmi)
+SYM_CODE_START(asm_exc_nmi)
ASM_CLAC
#ifdef CONFIG_X86_ESPFIX32
@@ -1612,7 +1374,7 @@
jb .Lnmi_from_sysenter_stack
/* Not on SYSENTER stack. */
- call do_nmi
+ call exc_nmi
jmp .Lnmi_return
.Lnmi_from_sysenter_stack:
@@ -1622,7 +1384,7 @@
*/
movl %esp, %ebx
movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
- call do_nmi
+ call exc_nmi
movl %ebx, %esp
.Lnmi_return:
@@ -1676,21 +1438,9 @@
lss (1+5+6)*4(%esp), %esp # back to espfix stack
jmp .Lirq_return
#endif
-SYM_CODE_END(nmi)
+SYM_CODE_END(asm_exc_nmi)
-SYM_CODE_START(int3)
- ASM_CLAC
- pushl $0
- pushl $do_int3
- jmp common_exception
-SYM_CODE_END(int3)
-
-SYM_CODE_START(general_protection)
- ASM_CLAC
- pushl $do_general_protection
- jmp common_exception
-SYM_CODE_END(general_protection)
-
+.pushsection .text, "ax"
SYM_CODE_START(rewind_stack_do_exit)
/* Prevent any naive code from trying to unwind to our caller. */
xorl %ebp, %ebp
@@ -1701,3 +1451,4 @@
call do_exit
1: jmp 1b
SYM_CODE_END(rewind_stack_do_exit)
+.popsection
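
For reference, a minimal C sketch of the user-vs-kernel decision that
handle_exception_return makes above; the constants match the usual x86
definitions and returning_to_user() is an illustrative name, not kernel code:

    #include <stdbool.h>

    #define X86_EFLAGS_VM    0x00020000u  /* v8086 mode flag */
    #define SEGMENT_RPL_MASK 0x3u         /* requested privilege level of CS */
    #define USER_RPL         0x3u         /* ring 3 */

    static bool returning_to_user(unsigned int eflags, unsigned int cs)
    {
            /* Mix EFLAGS.VM with CS.RPL, as the asm does with movb/andl. */
            unsigned int mix = (eflags & X86_EFLAGS_VM) | (cs & SEGMENT_RPL_MASK);

            /* v8086 (VM set) or RPL == 3 both mean "return to user". */
            return mix >= USER_RPL;       /* mirrors the unsigned jnb test */
    }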
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index eead1e2..d2a00c9 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -16,7 +16,6 @@
*
* Some macro usage:
* - SYM_FUNC_START/END:Define functions in the symbol table.
- * - TRACE_IRQ_*: Trace hardirq state for lock debugging.
* - idtentry: Define exception entry points.
*/
#include <linux/linkage.h>
@@ -37,6 +36,7 @@
#include <asm/pgtable_types.h>
#include <asm/export.h>
#include <asm/frame.h>
+#include <asm/trapnr.h>
#include <asm/nospec-branch.h>
#include <linux/err.h>
@@ -53,57 +53,6 @@
SYM_CODE_END(native_usergs_sysret64)
#endif /* CONFIG_PARAVIRT */
-.macro TRACE_IRQS_FLAGS flags:req
-#ifdef CONFIG_TRACE_IRQFLAGS
- btl $9, \flags /* interrupts off? */
- jnc 1f
- TRACE_IRQS_ON
-1:
-#endif
-.endm
-
-.macro TRACE_IRQS_IRETQ
- TRACE_IRQS_FLAGS EFLAGS(%rsp)
-.endm
-
-/*
- * When dynamic function tracer is enabled it will add a breakpoint
- * to all locations that it is about to modify, sync CPUs, update
- * all the code, sync CPUs, then remove the breakpoints. In this time
- * if lockdep is enabled, it might jump back into the debug handler
- * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF).
- *
- * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to
- * make sure the stack pointer does not get reset back to the top
- * of the debug stack, and instead just reuses the current stack.
- */
-#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)
-
-.macro TRACE_IRQS_OFF_DEBUG
- call debug_stack_set_zero
- TRACE_IRQS_OFF
- call debug_stack_reset
-.endm
-
-.macro TRACE_IRQS_ON_DEBUG
- call debug_stack_set_zero
- TRACE_IRQS_ON
- call debug_stack_reset
-.endm
-
-.macro TRACE_IRQS_IRETQ_DEBUG
- btl $9, EFLAGS(%rsp) /* interrupts off? */
- jnc 1f
- TRACE_IRQS_ON_DEBUG
-1:
-.endm
-
-#else
-# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF
-# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON
-# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ
-#endif
-
/*
* 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
*
@@ -144,11 +93,6 @@
SYM_CODE_START(entry_SYSCALL_64)
UNWIND_HINT_EMPTY
- /*
- * Interrupts are off on entry.
- * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
- * it is too small to ever cause noticeable irq latency.
- */
swapgs
/* tss.sp2 is scratch space. */
@@ -167,15 +111,11 @@
PUSH_AND_CLEAR_REGS rax=$-ENOSYS
- TRACE_IRQS_OFF
-
/* IRQs are off. */
movq %rax, %rdi
movq %rsp, %rsi
call do_syscall_64 /* returns with IRQs disabled */
- TRACE_IRQS_ON /* return enables interrupts */
-
/*
* Try to use SYSRET instead of IRET if we're returning to
* a completely clean 64-bit userspace context. If we're not,
@@ -279,6 +219,7 @@
* %rdi: prev task
* %rsi: next task
*/
+.pushsection .text, "ax"
SYM_FUNC_START(__switch_to_asm)
/*
* Save callee-saved registers
@@ -321,6 +262,7 @@
jmp __switch_to
SYM_FUNC_END(__switch_to_asm)
+.popsection
/*
* A newly forked process directly context switches into this address.
@@ -329,6 +271,7 @@
* rbx: kernel thread func (NULL for user thread)
* r12: kernel thread arg
*/
+.pushsection .text, "ax"
SYM_CODE_START(ret_from_fork)
UNWIND_HINT_EMPTY
movq %rax, %rdi
@@ -341,7 +284,6 @@
UNWIND_HINT_REGS
movq %rsp, %rdi
call syscall_return_slowpath /* returns with IRQs disabled */
- TRACE_IRQS_ON /* user mode is traced as IRQS on */
jmp swapgs_restore_regs_and_return_to_usermode
1:
@@ -357,34 +299,7 @@
movq $0, RAX(%rsp)
jmp 2b
SYM_CODE_END(ret_from_fork)
-
-/*
- * Build the entry stubs with some assembler magic.
- * We pack 1 stub into every 8-byte block.
- */
- .align 8
-SYM_CODE_START(irq_entries_start)
- vector=FIRST_EXTERNAL_VECTOR
- .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
- UNWIND_HINT_IRET_REGS
- pushq $(~vector+0x80) /* Note: always in signed byte range */
- jmp common_interrupt
- .align 8
- vector=vector+1
- .endr
-SYM_CODE_END(irq_entries_start)
-
- .align 8
-SYM_CODE_START(spurious_entries_start)
- vector=FIRST_SYSTEM_VECTOR
- .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR)
- UNWIND_HINT_IRET_REGS
- pushq $(~vector+0x80) /* Note: always in signed byte range */
- jmp common_spurious
- .align 8
- vector=vector+1
- .endr
-SYM_CODE_END(spurious_entries_start)
+.popsection
.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
#ifdef CONFIG_DEBUG_ENTRY
@@ -398,228 +313,185 @@
#endif
.endm
-/*
- * Enters the IRQ stack if we're not already using it. NMI-safe. Clobbers
- * flags and puts old RSP into old_rsp, and leaves all other GPRs alone.
- * Requires kernel GSBASE.
- *
- * The invariant is that, if irq_count != -1, then the IRQ stack is in use.
+/**
+ * idtentry_body - Macro to emit code calling the C function
+ * @cfunc: C function to be called
+ * @has_error_code: Hardware pushed error code on stack
*/
-.macro ENTER_IRQ_STACK regs=1 old_rsp save_ret=0
- DEBUG_ENTRY_ASSERT_IRQS_OFF
+.macro idtentry_body cfunc has_error_code:req
- .if \save_ret
- /*
- * If save_ret is set, the original stack contains one additional
- * entry -- the return address. Therefore, move the address one
- * entry below %rsp to \old_rsp.
- */
- leaq 8(%rsp), \old_rsp
- .else
- movq %rsp, \old_rsp
- .endif
-
- .if \regs
- UNWIND_HINT_REGS base=\old_rsp
- .endif
-
- incl PER_CPU_VAR(irq_count)
- jnz .Lirq_stack_push_old_rsp_\@
-
- /*
- * Right now, if we just incremented irq_count to zero, we've
- * claimed the IRQ stack but we haven't switched to it yet.
- *
- * If anything is added that can interrupt us here without using IST,
- * it must be *extremely* careful to limit its stack usage. This
- * could include kprobes and a hypothetical future IST-less #DB
- * handler.
- *
- * The OOPS unwinder relies on the word at the top of the IRQ
- * stack linking back to the previous RSP for the entire time we're
- * on the IRQ stack. For this to work reliably, we need to write
- * it before we actually move ourselves to the IRQ stack.
- */
-
- movq \old_rsp, PER_CPU_VAR(irq_stack_backing_store + IRQ_STACK_SIZE - 8)
- movq PER_CPU_VAR(hardirq_stack_ptr), %rsp
-
-#ifdef CONFIG_DEBUG_ENTRY
- /*
- * If the first movq above becomes wrong due to IRQ stack layout
- * changes, the only way we'll notice is if we try to unwind right
- * here. Assert that we set up the stack right to catch this type
- * of bug quickly.
- */
- cmpq -8(%rsp), \old_rsp
- je .Lirq_stack_okay\@
- ud2
- .Lirq_stack_okay\@:
-#endif
-
-.Lirq_stack_push_old_rsp_\@:
- pushq \old_rsp
-
- .if \regs
- UNWIND_HINT_REGS indirect=1
- .endif
-
- .if \save_ret
- /*
- * Push the return address to the stack. This return address can
- * be found at the "real" original RSP, which was offset by 8 at
- * the beginning of this macro.
- */
- pushq -8(\old_rsp)
- .endif
-.endm
-
-/*
- * Undoes ENTER_IRQ_STACK.
- */
-.macro LEAVE_IRQ_STACK regs=1
- DEBUG_ENTRY_ASSERT_IRQS_OFF
- /* We need to be off the IRQ stack before decrementing irq_count. */
- popq %rsp
-
- .if \regs
+ call error_entry
UNWIND_HINT_REGS
+
+ movq %rsp, %rdi /* pt_regs pointer into 1st argument */
+
+ .if \has_error_code == 1
+ movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument */
+ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
.endif
- /*
- * As in ENTER_IRQ_STACK, irq_count == 0, we are still claiming
- * the irq stack but we're not on it.
- */
+ call \cfunc
- decl PER_CPU_VAR(irq_count)
+ jmp error_return
+.endm
+
+/**
+ * idtentry - Macro to generate entry stubs for simple IDT entries
+ * @vector: Vector number
+ * @asmsym: ASM symbol for the entry point
+ * @cfunc: C function to be called
+ * @has_error_code: Hardware pushed error code on stack
+ *
+ * The macro emits code to set up the kernel context for straightforward
+ * and simple IDT entries. No IST stack, no paranoid entry checks.
+ */
+.macro idtentry vector asmsym cfunc has_error_code:req
+SYM_CODE_START(\asmsym)
+ UNWIND_HINT_IRET_REGS offset=\has_error_code*8
+ ASM_CLAC
+
+ .if \has_error_code == 0
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
+ .endif
+
+ .if \vector == X86_TRAP_BP
+ /*
+ * If coming from kernel space, create a 6-word gap to allow the
+ * int3 handler to emulate a call instruction.
+ */
+ testb $3, CS-ORIG_RAX(%rsp)
+ jnz .Lfrom_usermode_no_gap_\@
+ .rept 6
+ pushq 5*8(%rsp)
+ .endr
+ UNWIND_HINT_IRET_REGS offset=8
+.Lfrom_usermode_no_gap_\@:
+ .endif
+
+ idtentry_body \cfunc \has_error_code
+
+_ASM_NOKPROBE(\asmsym)
+SYM_CODE_END(\asmsym)
.endm
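
To make the pairing concrete: every stub this macro emits ends up in a C
handler whose shell looks like the sketch below (exc_foo is a placeholder;
the enter/exit helpers are the ones declared in asm/idtentry.h later in this
patch):

    __visible noinstr void exc_foo(struct pt_regs *regs)
    {
            bool rcu_exit = idtentry_enter_cond_rcu(regs);

            instrumentation_begin();
            /* actual handler work, with pt_regs built by the asm stub */
            instrumentation_end();
            idtentry_exit_cond_rcu(regs, rcu_exit);
    }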
/*
- * Interrupt entry helper function.
+ * Interrupt entry/exit.
*
- * Entry runs with interrupts off. Stack layout at entry:
- * +----------------------------------------------------+
- * | regs->ss |
- * | regs->rsp |
- * | regs->eflags |
- * | regs->cs |
- * | regs->ip |
- * +----------------------------------------------------+
- * | regs->orig_ax = ~(interrupt number) |
- * +----------------------------------------------------+
- * | return address |
- * +----------------------------------------------------+
+ * The interrupt stubs push (vector) onto the stack, which is the error_code
+ * position of idtentry exceptions, and jump to one of the two idtentry points
+ * (common/spurious).
+ *
+ * common_interrupt is a hotpath; align it to a cache line.
*/
-SYM_CODE_START(interrupt_entry)
- UNWIND_HINT_IRET_REGS offset=16
- ASM_CLAC
- cld
-
- testb $3, CS-ORIG_RAX+8(%rsp)
- jz 1f
- SWAPGS
- FENCE_SWAPGS_USER_ENTRY
- /*
- * Switch to the thread stack. The IRET frame and orig_ax are
- * on the stack, as well as the return address. RDI..R12 are
- * not (yet) on the stack and space has not (yet) been
- * allocated for them.
- */
- pushq %rdi
-
- /* Need to switch before accessing the thread stack. */
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
- movq %rsp, %rdi
- movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
-
- /*
- * We have RDI, return address, and orig_ax on the stack on
- * top of the IRET frame. That means offset=24
- */
- UNWIND_HINT_IRET_REGS base=%rdi offset=24
-
- pushq 7*8(%rdi) /* regs->ss */
- pushq 6*8(%rdi) /* regs->rsp */
- pushq 5*8(%rdi) /* regs->eflags */
- pushq 4*8(%rdi) /* regs->cs */
- pushq 3*8(%rdi) /* regs->ip */
- UNWIND_HINT_IRET_REGS
- pushq 2*8(%rdi) /* regs->orig_ax */
- pushq 8(%rdi) /* return address */
-
- movq (%rdi), %rdi
- jmp 2f
-1:
- FENCE_SWAPGS_KERNEL_ENTRY
-2:
- PUSH_AND_CLEAR_REGS save_ret=1
- ENCODE_FRAME_POINTER 8
-
- testb $3, CS+8(%rsp)
- jz 1f
-
- /*
- * IRQ from user mode.
- *
- * We need to tell lockdep that IRQs are off. We can't do this until
- * we fix gsbase, and we should do it before enter_from_user_mode
- * (which can take locks). Since TRACE_IRQS_OFF is idempotent,
- * the simplest way to handle it is to just call it twice if
- * we enter from user mode. There's no reason to optimize this since
- * TRACE_IRQS_OFF is a no-op if lockdep is off.
- */
- TRACE_IRQS_OFF
-
- CALL_enter_from_user_mode
-
-1:
- ENTER_IRQ_STACK old_rsp=%rdi save_ret=1
- /* We entered an interrupt context - irqs are off: */
- TRACE_IRQS_OFF
-
- ret
-SYM_CODE_END(interrupt_entry)
-_ASM_NOKPROBE(interrupt_entry)
-
-
-/* Interrupt entry/exit. */
+.macro idtentry_irq vector cfunc
+ .p2align CONFIG_X86_L1_CACHE_SHIFT
+ idtentry \vector asm_\cfunc \cfunc has_error_code=1
+.endm
/*
- * The interrupt stubs push (~vector+0x80) onto the stack and
- * then jump to common_spurious/interrupt.
+ * System vectors which invoke their handlers directly and do not go
+ * through the regular common device interrupt handling code.
*/
-SYM_CODE_START_LOCAL(common_spurious)
- addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */
- call interrupt_entry
- UNWIND_HINT_REGS indirect=1
- call smp_spurious_interrupt /* rdi points to pt_regs */
- jmp ret_from_intr
-SYM_CODE_END(common_spurious)
-_ASM_NOKPROBE(common_spurious)
+.macro idtentry_sysvec vector cfunc
+ idtentry \vector asm_\cfunc \cfunc has_error_code=0
+.endm
-/* common_interrupt is a hotpath. Align it */
- .p2align CONFIG_X86_L1_CACHE_SHIFT
-SYM_CODE_START_LOCAL(common_interrupt)
- addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */
- call interrupt_entry
- UNWIND_HINT_REGS indirect=1
- call do_IRQ /* rdi points to pt_regs */
- /* 0(%rsp): old RSP */
-ret_from_intr:
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_OFF
+/**
+ * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB
+ * @vector: Vector number
+ * @asmsym: ASM symbol for the entry point
+ * @cfunc: C function to be called
+ *
+ * The macro emits code to set up the kernel context for #MC and #DB
+ *
+ * If the entry comes from user space it uses the normal entry path
+ * including the return to user space work and preemption checks on
+ * exit.
+ *
+ * If it hits in kernel mode then it needs to go through the paranoid
+ * entry as the exception can hit any random state. No preemption
+ * check on exit to keep the paranoid path simple.
+ */
+.macro idtentry_mce_db vector asmsym cfunc
+SYM_CODE_START(\asmsym)
+ UNWIND_HINT_IRET_REGS
+ ASM_CLAC
- LEAVE_IRQ_STACK
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
- testb $3, CS(%rsp)
- jz retint_kernel
+ /*
+ * If the entry is from userspace, switch stacks and treat it as
+ * a normal entry.
+ */
+ testb $3, CS-ORIG_RAX(%rsp)
+ jnz .Lfrom_usermode_switch_stack_\@
- /* Interrupt came from user space */
-.Lretint_user:
- mov %rsp,%rdi
- call prepare_exit_to_usermode
- TRACE_IRQS_ON
+ /*
+ * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX.
+ * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS
+ */
+ call paranoid_entry
+ UNWIND_HINT_REGS
+
+ movq %rsp, %rdi /* pt_regs pointer */
+
+ call \cfunc
+
+ jmp paranoid_exit
+
+ /* Switch to the regular task stack and use the noist entry point */
+.Lfrom_usermode_switch_stack_\@:
+ idtentry_body noist_\cfunc, has_error_code=0
+
+_ASM_NOKPROBE(\asmsym)
+SYM_CODE_END(\asmsym)
+.endm
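
On the C side this macro implies two handlers per vector, selected by the
originating mode; a hedged sketch with placeholder names:

    /* Kernel-mode entry: paranoid path, may run with arbitrary GSBASE/CR3. */
    __visible noinstr void exc_example_db(struct pt_regs *regs)
    {
            /* keep it minimal; no exit-to-user work happens on this path */
    }

    /* User-mode entry: runs on the regular task stack via idtentry_body. */
    __visible noinstr void noist_exc_example_db(struct pt_regs *regs)
    {
            /* the full return-to-user machinery applies on the way out */
    }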
+
+/*
+ * Double fault entry. Straight paranoid. No checks on which context
+ * this comes from, because for the espfix-induced #DF this would do the wrong
+ * thing.
+ */
+.macro idtentry_df vector asmsym cfunc
+SYM_CODE_START(\asmsym)
+ UNWIND_HINT_IRET_REGS offset=8
+ ASM_CLAC
+
+ /*
+ * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX.
+ * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS
+ */
+ call paranoid_entry
+ UNWIND_HINT_REGS
+
+ movq %rsp, %rdi /* pt_regs pointer into first argument */
+ movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument */
+ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
+ call \cfunc
+
+ jmp paranoid_exit
+
+_ASM_NOKPROBE(\asmsym)
+SYM_CODE_END(\asmsym)
+.endm
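
The matching C handler receives the hardware error code as its second
argument, taken from ORIG_RAX as shown above; a hedged sketch (placeholder
name):

    __visible noinstr void exc_example_df(struct pt_regs *regs,
                                          unsigned long error_code)
    {
            /* #DF: report and die; there is no sane way to recover */
    }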
+
+/*
+ * Include the defines which emit the idt entries which are shared
+ * between 32 and 64 bit and emit the __irqentry_text_* markers
+ * so the stacktrace boundary checks work.
+ */
+ .align 16
+ .globl __irqentry_text_start
+__irqentry_text_start:
+
+#include <asm/idtentry.h>
+
+ .align 16
+ .globl __irqentry_text_end
+__irqentry_text_end:
+
+SYM_CODE_START_LOCAL(common_interrupt_return)
SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
#ifdef CONFIG_DEBUG_ENTRY
/* Assert that pt_regs indicates user mode. */
@@ -662,23 +534,6 @@
INTERRUPT_RETURN
-/* Returning to kernel space */
-retint_kernel:
-#ifdef CONFIG_PREEMPTION
- /* Interrupts are off */
- /* Check if we need preemption */
- btl $9, EFLAGS(%rsp) /* were interrupts off? */
- jnc 1f
- cmpl $0, PER_CPU_VAR(__preempt_count)
- jnz 1f
- call preempt_schedule_irq
-1:
-#endif
- /*
- * The iretq could re-enable interrupts:
- */
- TRACE_IRQS_IRETQ
-
SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL)
#ifdef CONFIG_DEBUG_ENTRY
/* Assert that pt_regs indicates kernel mode. */
@@ -710,7 +565,7 @@
/*
* This may fault. Non-paranoid faults on return to userspace are
* handled by fixup_bad_iret. These include #SS, #GP, and #NP.
- * Double-faults due to espfix64 are handled in do_double_fault.
+ * Double-faults due to espfix64 are handled in exc_double_fault.
* Other faults here are fatal.
*/
iretq
@@ -788,280 +643,32 @@
*/
jmp native_irq_return_iret
#endif
-SYM_CODE_END(common_interrupt)
-_ASM_NOKPROBE(common_interrupt)
+SYM_CODE_END(common_interrupt_return)
+_ASM_NOKPROBE(common_interrupt_return)
/*
- * APIC interrupts.
+ * Reload gs selector with exception handling
+ * edi: new selector
+ *
+ * This is in entry.text as it should not be instrumented.
*/
-.macro apicinterrupt3 num sym do_sym
-SYM_CODE_START(\sym)
- UNWIND_HINT_IRET_REGS
- pushq $~(\num)
-.Lcommon_\sym:
- call interrupt_entry
- UNWIND_HINT_REGS indirect=1
- call \do_sym /* rdi points to pt_regs */
- jmp ret_from_intr
-SYM_CODE_END(\sym)
-_ASM_NOKPROBE(\sym)
-.endm
-
-/* Make sure APIC interrupt handlers end up in the irqentry section: */
-#define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax"
-#define POP_SECTION_IRQENTRY .popsection
-
-.macro apicinterrupt num sym do_sym
-PUSH_SECTION_IRQENTRY
-apicinterrupt3 \num \sym \do_sym
-POP_SECTION_IRQENTRY
-.endm
-
-#ifdef CONFIG_SMP
-apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
-apicinterrupt3 REBOOT_VECTOR reboot_interrupt smp_reboot_interrupt
-#endif
-
-#ifdef CONFIG_X86_UV
-apicinterrupt3 UV_BAU_MESSAGE uv_bau_message_intr1 uv_bau_message_interrupt
-#endif
-
-apicinterrupt LOCAL_TIMER_VECTOR apic_timer_interrupt smp_apic_timer_interrupt
-apicinterrupt X86_PLATFORM_IPI_VECTOR x86_platform_ipi smp_x86_platform_ipi
-
-#ifdef CONFIG_HAVE_KVM
-apicinterrupt3 POSTED_INTR_VECTOR kvm_posted_intr_ipi smp_kvm_posted_intr_ipi
-apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi
-apicinterrupt3 POSTED_INTR_NESTED_VECTOR kvm_posted_intr_nested_ipi smp_kvm_posted_intr_nested_ipi
-#endif
-
-#ifdef CONFIG_X86_MCE_THRESHOLD
-apicinterrupt THRESHOLD_APIC_VECTOR threshold_interrupt smp_threshold_interrupt
-#endif
-
-#ifdef CONFIG_X86_MCE_AMD
-apicinterrupt DEFERRED_ERROR_VECTOR deferred_error_interrupt smp_deferred_error_interrupt
-#endif
-
-#ifdef CONFIG_X86_THERMAL_VECTOR
-apicinterrupt THERMAL_APIC_VECTOR thermal_interrupt smp_thermal_interrupt
-#endif
-
-#ifdef CONFIG_SMP
-apicinterrupt CALL_FUNCTION_SINGLE_VECTOR call_function_single_interrupt smp_call_function_single_interrupt
-apicinterrupt CALL_FUNCTION_VECTOR call_function_interrupt smp_call_function_interrupt
-apicinterrupt RESCHEDULE_VECTOR reschedule_interrupt smp_reschedule_interrupt
-#endif
-
-apicinterrupt ERROR_APIC_VECTOR error_interrupt smp_error_interrupt
-apicinterrupt SPURIOUS_APIC_VECTOR spurious_interrupt smp_spurious_interrupt
-
-#ifdef CONFIG_IRQ_WORK
-apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
-#endif
-
-/*
- * Exception entry points.
- */
-#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8)
-
-.macro idtentry_part do_sym, has_error_code:req, read_cr2:req, paranoid:req, shift_ist=-1, ist_offset=0
-
- .if \paranoid
- call paranoid_entry
- /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
- .else
- call error_entry
- .endif
- UNWIND_HINT_REGS
-
- .if \read_cr2
- /*
- * Store CR2 early so subsequent faults cannot clobber it. Use R12 as
- * intermediate storage as RDX can be clobbered in enter_from_user_mode().
- * GET_CR2_INTO can clobber RAX.
- */
- GET_CR2_INTO(%r12);
- .endif
-
- .if \shift_ist != -1
- TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */
- .else
- TRACE_IRQS_OFF
- .endif
-
- .if \paranoid == 0
- testb $3, CS(%rsp)
- jz .Lfrom_kernel_no_context_tracking_\@
- CALL_enter_from_user_mode
-.Lfrom_kernel_no_context_tracking_\@:
- .endif
-
- movq %rsp, %rdi /* pt_regs pointer */
-
- .if \has_error_code
- movq ORIG_RAX(%rsp), %rsi /* get error code */
- movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
- .else
- xorl %esi, %esi /* no error code */
- .endif
-
- .if \shift_ist != -1
- subq $\ist_offset, CPU_TSS_IST(\shift_ist)
- .endif
-
- .if \read_cr2
- movq %r12, %rdx /* Move CR2 into 3rd argument */
- .endif
-
- call \do_sym
-
- .if \shift_ist != -1
- addq $\ist_offset, CPU_TSS_IST(\shift_ist)
- .endif
-
- .if \paranoid
- /* this procedure expect "no swapgs" flag in ebx */
- jmp paranoid_exit
- .else
- jmp error_exit
- .endif
-
-.endm
-
-/**
- * idtentry - Generate an IDT entry stub
- * @sym: Name of the generated entry point
- * @do_sym: C function to be called
- * @has_error_code: True if this IDT vector has an error code on the stack
- * @paranoid: non-zero means that this vector may be invoked from
- * kernel mode with user GSBASE and/or user CR3.
- * 2 is special -- see below.
- * @shift_ist: Set to an IST index if entries from kernel mode should
- * decrement the IST stack so that nested entries get a
- * fresh stack. (This is for #DB, which has a nasty habit
- * of recursing.)
- * @create_gap: create a 6-word stack gap when coming from kernel mode.
- * @read_cr2: load CR2 into the 3rd argument; done before calling any C code
- *
- * idtentry generates an IDT stub that sets up a usable kernel context,
- * creates struct pt_regs, and calls @do_sym. The stub has the following
- * special behaviors:
- *
- * On an entry from user mode, the stub switches from the trampoline or
- * IST stack to the normal thread stack. On an exit to user mode, the
- * normal exit-to-usermode path is invoked.
- *
- * On an exit to kernel mode, if @paranoid == 0, we check for preemption,
- * whereas we omit the preemption check if @paranoid != 0. This is purely
- * because the implementation is simpler this way. The kernel only needs
- * to check for asynchronous kernel preemption when IRQ handlers return.
- *
- * If @paranoid == 0, then the stub will handle IRET faults by pretending
- * that the fault came from user mode. It will handle gs_change faults by
- * pretending that the fault happened with kernel GSBASE. Since this handling
- * is omitted for @paranoid != 0, the #GP, #SS, and #NP stubs must have
- * @paranoid == 0. This special handling will do the wrong thing for
- * espfix-induced #DF on IRET, so #DF must not use @paranoid == 0.
- *
- * @paranoid == 2 is special: the stub will never switch stacks. This is for
- * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
- */
-.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0 read_cr2=0
-SYM_CODE_START(\sym)
- UNWIND_HINT_IRET_REGS offset=\has_error_code*8
-
- /* Sanity check */
- .if \shift_ist != -1 && \paranoid != 1
- .error "using shift_ist requires paranoid=1"
- .endif
-
- .if \create_gap && \paranoid
- .error "using create_gap requires paranoid=0"
- .endif
-
- ASM_CLAC
-
- .if \has_error_code == 0
- pushq $-1 /* ORIG_RAX: no syscall to restart */
- .endif
-
- .if \paranoid == 1
- testb $3, CS-ORIG_RAX(%rsp) /* If coming from userspace, switch stacks */
- jnz .Lfrom_usermode_switch_stack_\@
- .endif
-
- .if \create_gap == 1
- /*
- * If coming from kernel space, create a 6-word gap to allow the
- * int3 handler to emulate a call instruction.
- */
- testb $3, CS-ORIG_RAX(%rsp)
- jnz .Lfrom_usermode_no_gap_\@
- .rept 6
- pushq 5*8(%rsp)
- .endr
- UNWIND_HINT_IRET_REGS offset=8
-.Lfrom_usermode_no_gap_\@:
- .endif
-
- idtentry_part \do_sym, \has_error_code, \read_cr2, \paranoid, \shift_ist, \ist_offset
-
- .if \paranoid == 1
- /*
- * Entry from userspace. Switch stacks and treat it
- * as a normal entry. This means that paranoid handlers
- * run in real process context if user_mode(regs).
- */
-.Lfrom_usermode_switch_stack_\@:
- idtentry_part \do_sym, \has_error_code, \read_cr2, paranoid=0
- .endif
-
-_ASM_NOKPROBE(\sym)
-SYM_CODE_END(\sym)
-.endm
-
-idtentry divide_error do_divide_error has_error_code=0
-idtentry overflow do_overflow has_error_code=0
-idtentry bounds do_bounds has_error_code=0
-idtentry invalid_op do_invalid_op has_error_code=0
-idtentry device_not_available do_device_not_available has_error_code=0
-idtentry double_fault do_double_fault has_error_code=1 paranoid=2 read_cr2=1
-idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
-idtentry invalid_TSS do_invalid_TSS has_error_code=1
-idtentry segment_not_present do_segment_not_present has_error_code=1
-idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0
-idtentry coprocessor_error do_coprocessor_error has_error_code=0
-idtentry alignment_check do_alignment_check has_error_code=1
-idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
-
-
- /*
- * Reload gs selector with exception handling
- * edi: new selector
- */
-SYM_FUNC_START(native_load_gs_index)
+SYM_FUNC_START(asm_load_gs_index)
FRAME_BEGIN
- pushfq
- DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
- TRACE_IRQS_OFF
- SWAPGS
+ swapgs
.Lgs_change:
movl %edi, %gs
2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
- SWAPGS
- TRACE_IRQS_FLAGS (%rsp)
- popfq
+ swapgs
FRAME_END
ret
-SYM_FUNC_END(native_load_gs_index)
-EXPORT_SYMBOL(native_load_gs_index)
+SYM_FUNC_END(asm_load_gs_index)
+EXPORT_SYMBOL(asm_load_gs_index)
_ASM_EXTABLE(.Lgs_change, .Lbad_gs)
.section .fixup, "ax"
/* running with kernelgs */
SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs)
- SWAPGS /* switch back to user gs */
+ swapgs /* switch back to user gs */
.macro ZAP_GS
/* This can't be a string because the preprocessor needs to see it. */
movl $__USER_DS, %eax
@@ -1074,20 +681,46 @@
SYM_CODE_END(.Lbad_gs)
.previous
-/* Call softirq on interrupt stack. Interrupts are off. */
-SYM_FUNC_START(do_softirq_own_stack)
- pushq %rbp
- mov %rsp, %rbp
- ENTER_IRQ_STACK regs=0 old_rsp=%r11
- call __do_softirq
- LEAVE_IRQ_STACK regs=0
+/*
+ * rdi: New stack pointer, pointing to the top word of the stack
+ * rsi: Function pointer
+ * rdx: Function argument (can be NULL if none)
+ */
+SYM_FUNC_START(asm_call_on_stack)
+ /*
+ * Save the frame pointer unconditionally. This allows the ORC
+ * unwinder to handle the stack switch.
+ */
+ pushq %rbp
+ mov %rsp, %rbp
+
+ /*
+ * The unwinder relies on the word at the top of the new stack
+ * page linking back to the previous RSP.
+ */
+ mov %rsp, (%rdi)
+ mov %rdi, %rsp
+ /* Move the argument to the right place */
+ mov %rdx, %rdi
+
+1:
+ .pushsection .discard.instr_begin
+ .long 1b - .
+ .popsection
+
+ CALL_NOSPEC rsi
+
+2:
+ .pushsection .discard.instr_end
+ .long 2b - .
+ .popsection
+
+ /* Restore the previous stack pointer from RBP. */
leaveq
ret
-SYM_FUNC_END(do_softirq_own_stack)
+SYM_FUNC_END(asm_call_on_stack)
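
A hedged C-side sketch of driving asm_call_on_stack, with the prototype
inferred from the register comments above (do_work and switch_and_call are
illustrative names):

    extern void asm_call_on_stack(void *sp, void (*func)(void *), void *arg);

    static void do_work(void *arg)
    {
            /* runs on the stack whose top word was handed in as sp */
    }

    static void switch_and_call(void *stack_top)
    {
            /*
             * sp must point at the top word of the new stack; the asm
             * stores the old RSP there so the ORC unwinder can link back.
             */
            asm_call_on_stack(stack_top, do_work, NULL);
    }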
#ifdef CONFIG_XEN_PV
-idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0
-
/*
* A note on the "critical region" in our callback handler.
* We want to avoid stacking callback handlers due to events occurring
@@ -1100,9 +733,10 @@
* So, on entry to the handler we detect whether we interrupted an
* existing activation in its critical region -- if so, we pop the current
* activation and restart the handler using the previous one.
+ *
+ * C calling convention: exc_xen_hypervisor_callback(struct pt_regs *)
*/
-/* do_hypervisor_callback(struct *pt_regs) */
-SYM_CODE_START_LOCAL(xen_do_hypervisor_callback)
+SYM_CODE_START_LOCAL(exc_xen_hypervisor_callback)
/*
* Since we don't modify %rdi, evtchn_do_upcall(struct pt_regs *) will
@@ -1112,15 +746,10 @@
movq %rdi, %rsp /* we don't return, adjust the stack frame */
UNWIND_HINT_REGS
- ENTER_IRQ_STACK old_rsp=%r10
- call xen_evtchn_do_upcall
- LEAVE_IRQ_STACK
+ call xen_pv_evtchn_do_upcall
-#ifndef CONFIG_PREEMPTION
- call xen_maybe_preempt_hcall
-#endif
- jmp error_exit
-SYM_CODE_END(xen_do_hypervisor_callback)
+ jmp error_return
+SYM_CODE_END(exc_xen_hypervisor_callback)
/*
* Hypervisor uses this for application faults while it executes.
@@ -1155,7 +784,7 @@
addq $0x30, %rsp
pushq $0 /* RIP */
UNWIND_HINT_IRET_REGS offset=8
- jmp general_protection
+ jmp asm_exc_general_protection
1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
movq (%rsp), %rcx
movq 8(%rsp), %r11
@@ -1164,48 +793,10 @@
pushq $-1 /* orig_ax = -1 => not a system call */
PUSH_AND_CLEAR_REGS
ENCODE_FRAME_POINTER
- jmp error_exit
+ jmp error_return
SYM_CODE_END(xen_failsafe_callback)
#endif /* CONFIG_XEN_PV */
-#ifdef CONFIG_XEN_PVHVM
-apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
- xen_hvm_callback_vector xen_evtchn_do_upcall
-#endif
-
-
-#if IS_ENABLED(CONFIG_HYPERV)
-apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
- hyperv_callback_vector hyperv_vector_handler
-
-apicinterrupt3 HYPERV_REENLIGHTENMENT_VECTOR \
- hyperv_reenlightenment_vector hyperv_reenlightenment_intr
-
-apicinterrupt3 HYPERV_STIMER0_VECTOR \
- hv_stimer0_callback_vector hv_stimer0_vector_handler
-#endif /* CONFIG_HYPERV */
-
-#if IS_ENABLED(CONFIG_ACRN_GUEST)
-apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
- acrn_hv_callback_vector acrn_hv_vector_handler
-#endif
-
-idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=IST_INDEX_DB ist_offset=DB_STACK_OFFSET
-idtentry int3 do_int3 has_error_code=0 create_gap=1
-idtentry stack_segment do_stack_segment has_error_code=1
-
-#ifdef CONFIG_XEN_PV
-idtentry xennmi do_nmi has_error_code=0
-idtentry xendebug do_debug has_error_code=0
-#endif
-
-idtentry general_protection do_general_protection has_error_code=1
-idtentry page_fault do_page_fault has_error_code=1 read_cr2=1
-
-#ifdef CONFIG_X86_MCE
-idtentry machine_check do_mce has_error_code=0 paranoid=1
-#endif
-
/*
* Save all registers in pt_regs, and switch gs if needed.
* Use slow, but surefire "are we in kernel?" check.
@@ -1261,17 +852,13 @@
*/
SYM_CODE_START_LOCAL(paranoid_exit)
UNWIND_HINT_REGS
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_OFF_DEBUG
testl %ebx, %ebx /* swapgs needed? */
jnz .Lparanoid_exit_no_swapgs
- TRACE_IRQS_IRETQ
/* Always restore stashed CR3 value (see paranoid_entry) */
RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
SWAPGS_UNSAFE_STACK
jmp restore_regs_and_return_to_kernel
.Lparanoid_exit_no_swapgs:
- TRACE_IRQS_IRETQ_DEBUG
/* Always restore stashed CR3 value (see paranoid_entry) */
RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
jmp restore_regs_and_return_to_kernel
@@ -1335,7 +922,6 @@
*/
SWAPGS
FENCE_SWAPGS_USER_ENTRY
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
jmp .Lerror_entry_done
.Lbstep_iret:
@@ -1362,14 +948,13 @@
jmp .Lerror_entry_from_usermode_after_swapgs
SYM_CODE_END(error_entry)
-SYM_CODE_START_LOCAL(error_exit)
+SYM_CODE_START_LOCAL(error_return)
UNWIND_HINT_REGS
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_OFF
+ DEBUG_ENTRY_ASSERT_IRQS_OFF
testb $3, CS(%rsp)
- jz retint_kernel
- jmp .Lretint_user
-SYM_CODE_END(error_exit)
+ jz restore_regs_and_return_to_kernel
+ jmp swapgs_restore_regs_and_return_to_usermode
+SYM_CODE_END(error_return)
/*
* Runs on exception stack. Xen PV does not go through this path at all,
@@ -1379,7 +964,7 @@
* %r14: Used to save/restore the CR3 of the interrupted context
* when PAGE_TABLE_ISOLATION is in use. Do not clobber.
*/
-SYM_CODE_START(nmi)
+SYM_CODE_START(asm_exc_nmi)
UNWIND_HINT_IRET_REGS
/*
@@ -1464,7 +1049,7 @@
movq %rsp, %rdi
movq $-1, %rsi
- call do_nmi
+ call exc_nmi
/*
* Return back to user mode. We must *not* do the normal exit
@@ -1521,7 +1106,7 @@
* end_repeat_nmi, then we are a nested NMI. We must not
* modify the "iret" frame because it's being written by
* the outer NMI. That's okay; the outer NMI handler is
- * about to about to call do_nmi anyway, so we can just
+ * about to call exc_nmi() anyway, so we can just
* resume the outer NMI.
*/
@@ -1640,7 +1225,7 @@
* RSP is pointing to "outermost RIP". gsbase is unknown, but, if
* we're repeating an NMI, gsbase has the same value that it had on
* the first iteration. paranoid_entry will load the kernel
- * gsbase if needed before we call do_nmi. "NMI executing"
+ * gsbase if needed before we call exc_nmi(). "NMI executing"
* is zero.
*/
movq $1, 10*8(%rsp) /* Set "NMI executing". */
@@ -1674,10 +1259,9 @@
call paranoid_entry
UNWIND_HINT_REGS
- /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
movq %rsp, %rdi
movq $-1, %rsi
- call do_nmi
+ call exc_nmi
/* Always restore stashed CR3 value (see paranoid_entry) */
RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
@@ -1714,7 +1298,7 @@
* about espfix64 on the way back to kernel mode.
*/
iretq
-SYM_CODE_END(nmi)
+SYM_CODE_END(asm_exc_nmi)
#ifndef CONFIG_IA32_EMULATION
/*
@@ -1728,6 +1312,7 @@
SYM_CODE_END(ignore_sysret)
#endif
+.pushsection .text, "ax"
SYM_CODE_START(rewind_stack_do_exit)
UNWIND_HINT_FUNC
/* Prevent any naive code from trying to unwind to our caller. */
@@ -1739,3 +1324,4 @@
call do_exit
SYM_CODE_END(rewind_stack_do_exit)
+.popsection
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index f1d3cca..0f974ae 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -46,12 +46,14 @@
* ebp user stack
* 0(%ebp) arg6
*/
-SYM_FUNC_START(entry_SYSENTER_compat)
+SYM_CODE_START(entry_SYSENTER_compat)
+ UNWIND_HINT_EMPTY
/* Interrupts are off on entry. */
SWAPGS
- /* We are about to clobber %rsp anyway, clobbering here is OK */
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+ pushq %rax
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
+ popq %rax
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -104,6 +106,9 @@
xorl %r14d, %r14d /* nospec r14 */
pushq $0 /* pt_regs->r15 = 0 */
xorl %r15d, %r15d /* nospec r15 */
+
+ UNWIND_HINT_REGS
+
cld
/*
@@ -129,17 +134,11 @@
jnz .Lsysenter_fix_flags
.Lsysenter_flags_fixed:
- /*
- * User mode is traced as though IRQs are on, and SYSENTER
- * turned them off.
- */
- TRACE_IRQS_OFF
-
movq %rsp, %rdi
call do_fast_syscall_32
/* XEN PV guests always use IRET path */
- ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
- "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
+ ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \
+ "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
jmp sysret32_from_system_call
.Lsysenter_fix_flags:
@@ -147,7 +146,7 @@
popfq
jmp .Lsysenter_flags_fixed
SYM_INNER_LABEL(__end_entry_SYSENTER_compat, SYM_L_GLOBAL)
-SYM_FUNC_END(entry_SYSENTER_compat)
+SYM_CODE_END(entry_SYSENTER_compat)
/*
* 32-bit SYSCALL entry.
@@ -197,6 +196,7 @@
* 0(%esp) arg6
*/
SYM_CODE_START(entry_SYSCALL_compat)
+ UNWIND_HINT_EMPTY
/* Interrupts are off on entry. */
swapgs
@@ -247,17 +247,13 @@
pushq $0 /* pt_regs->r15 = 0 */
xorl %r15d, %r15d /* nospec r15 */
- /*
- * User mode is traced as though IRQs are on, and SYSENTER
- * turned them off.
- */
- TRACE_IRQS_OFF
+ UNWIND_HINT_REGS
movq %rsp, %rdi
call do_fast_syscall_32
/* XEN PV guests always use IRET path */
- ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
- "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
+ ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \
+ "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
/* Opportunistic SYSRET */
sysret32_from_system_call:
@@ -266,7 +262,7 @@
* stack. So let's erase the thread stack right now.
*/
STACKLEAK_ERASE
- TRACE_IRQS_ON /* User mode traces as IRQs on. */
+
movq RBX(%rsp), %rbx /* pt_regs->rbx */
movq RBP(%rsp), %rbp /* pt_regs->rbp */
movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */
@@ -340,6 +336,7 @@
* ebp arg6
*/
SYM_CODE_START(entry_INT80_compat)
+ UNWIND_HINT_EMPTY
/*
* Interrupts are off on entry.
*/
@@ -361,8 +358,11 @@
/* Need to switch before accessing the thread stack. */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
+
/* In the Xen PV case we already run on the thread stack. */
- ALTERNATIVE "movq %rsp, %rdi", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV
+ ALTERNATIVE "", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV
+
+ movq %rsp, %rdi
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
pushq 6*8(%rdi) /* regs->ss */
@@ -401,19 +401,12 @@
xorl %r14d, %r14d /* nospec r14 */
pushq %r15 /* pt_regs->r15 */
xorl %r15d, %r15d /* nospec r15 */
- cld
- /*
- * User mode is traced as though IRQs are on, and the interrupt
- * gate turned them off.
- */
- TRACE_IRQS_OFF
+ UNWIND_HINT_REGS
+
+ cld
movq %rsp, %rdi
call do_int80_syscall_32
-.Lsyscall_32_done:
-
- /* Go back to user mode. */
- TRACE_IRQS_ON
jmp swapgs_restore_regs_and_return_to_usermode
SYM_CODE_END(entry_INT80_compat)
diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S
index dbe4493..ccd3287 100644
--- a/arch/x86/entry/thunk_64.S
+++ b/arch/x86/entry/thunk_64.S
@@ -3,7 +3,6 @@
* Save registers before calling assembly functions. This avoids
* disturbance of register allocation in some inline assembly constructs.
* Copyright 2001,2002 by Andi Kleen, SuSE Labs.
- * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc.
*/
#include <linux/linkage.h>
#include "calling.h"
@@ -37,15 +36,6 @@
_ASM_NOKPROBE(\name)
.endm
-#ifdef CONFIG_TRACE_IRQFLAGS
- THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1
- THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1
-#endif
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- THUNK lockdep_sys_exit_thunk,lockdep_sys_exit
-#endif
-
#ifdef CONFIG_PREEMPTION
THUNK preempt_schedule_thunk, preempt_schedule
THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace
@@ -53,9 +43,7 @@
EXPORT_SYMBOL(preempt_schedule_notrace_thunk)
#endif
-#if defined(CONFIG_TRACE_IRQFLAGS) \
- || defined(CONFIG_DEBUG_LOCK_ALLOC) \
- || defined(CONFIG_PREEMPTION)
+#ifdef CONFIG_PREEMPTION
SYM_CODE_START_LOCAL_NOALIGN(.L_restore)
popq %r11
popq %r10
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index e213707..a54c6a4 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -15,6 +15,7 @@
#include <asm/hypervisor.h>
#include <asm/hyperv-tlfs.h>
#include <asm/mshyperv.h>
+#include <asm/idtentry.h>
#include <linux/version.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
@@ -152,15 +153,11 @@
ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT;
}
-__visible void __irq_entry hyperv_reenlightenment_intr(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_reenlightenment)
{
- entering_ack_irq();
-
+ ack_APIC_irq();
inc_irq_stat(irq_hv_reenlightenment_count);
-
schedule_delayed_work(&hv_reenlightenment_work, HZ/10);
-
- exiting_irq();
}
void set_hv_tscchange_cb(void (*cb)(void))
diff --git a/arch/x86/include/asm/acrn.h b/arch/x86/include/asm/acrn.h
deleted file mode 100644
index 4adb13f..0000000
--- a/arch/x86/include/asm/acrn.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_ACRN_H
-#define _ASM_X86_ACRN_H
-
-extern void acrn_hv_callback_vector(void);
-#ifdef CONFIG_TRACING
-#define trace_acrn_hv_callback_vector acrn_hv_callback_vector
-#endif
-
-extern void acrn_hv_vector_handler(struct pt_regs *regs);
-#endif /* _ASM_X86_ACRN_H */
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index c7df20e..455066a 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -57,6 +57,7 @@
/* initialized to the number of CPUs on the node sharing this bank */
refcount_t cpus;
+ unsigned int shared;
};
struct amd_northbridge {
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 19e94af..2cc44e9 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -519,39 +519,6 @@
static inline void apic_smt_update(void) { }
#endif
-extern void irq_enter(void);
-extern void irq_exit(void);
-
-static inline void entering_irq(void)
-{
- irq_enter();
- kvm_set_cpu_l1tf_flush_l1d();
-}
-
-static inline void entering_ack_irq(void)
-{
- entering_irq();
- ack_APIC_irq();
-}
-
-static inline void ipi_entering_ack_irq(void)
-{
- irq_enter();
- ack_APIC_irq();
- kvm_set_cpu_l1tf_flush_l1d();
-}
-
-static inline void exiting_irq(void)
-{
- irq_exit();
-}
-
-static inline void exiting_ack_irq(void)
-{
- ack_APIC_irq();
- irq_exit();
-}
-
extern void ioapic_zap_locks(void);
#endif /* _ASM_X86_APIC_H */
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index a9ae588..bf35e47 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -205,13 +205,13 @@
}
#define arch_atomic_try_cmpxchg arch_atomic_try_cmpxchg
-static inline int arch_atomic_xchg(atomic_t *v, int new)
+static __always_inline int arch_atomic_xchg(atomic_t *v, int new)
{
return arch_xchg(&v->counter, new);
}
#define arch_atomic_xchg arch_atomic_xchg
-static inline void arch_atomic_and(int i, atomic_t *v)
+static __always_inline void arch_atomic_and(int i, atomic_t *v)
{
asm volatile(LOCK_PREFIX "andl %1,%0"
: "+m" (v->counter)
@@ -219,7 +219,7 @@
: "memory");
}
-static inline int arch_atomic_fetch_and(int i, atomic_t *v)
+static __always_inline int arch_atomic_fetch_and(int i, atomic_t *v)
{
int val = arch_atomic_read(v);
@@ -229,7 +229,7 @@
}
#define arch_atomic_fetch_and arch_atomic_fetch_and
-static inline void arch_atomic_or(int i, atomic_t *v)
+static __always_inline void arch_atomic_or(int i, atomic_t *v)
{
asm volatile(LOCK_PREFIX "orl %1,%0"
: "+m" (v->counter)
@@ -237,7 +237,7 @@
: "memory");
}
-static inline int arch_atomic_fetch_or(int i, atomic_t *v)
+static __always_inline int arch_atomic_fetch_or(int i, atomic_t *v)
{
int val = arch_atomic_read(v);
@@ -247,7 +247,7 @@
}
#define arch_atomic_fetch_or arch_atomic_fetch_or
-static inline void arch_atomic_xor(int i, atomic_t *v)
+static __always_inline void arch_atomic_xor(int i, atomic_t *v)
{
asm volatile(LOCK_PREFIX "xorl %1,%0"
: "+m" (v->counter)
@@ -255,7 +255,7 @@
: "memory");
}
-static inline int arch_atomic_fetch_xor(int i, atomic_t *v)
+static __always_inline int arch_atomic_fetch_xor(int i, atomic_t *v)
{
int val = arch_atomic_read(v);
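
The __always_inline promotion matters for the noinstr entry work elsewhere
in this series: an out-of-line atomic would show up as a call from
non-instrumentable into instrumentable text. A hedged sketch of the
constraint (example_entry is a placeholder):

    noinstr void example_entry(atomic_t *v)
    {
            /* inlined thanks to __always_inline: no instrumentable call edge */
            arch_atomic_or(1, v);
    }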
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index facba9b..fb34ff6 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -70,14 +70,17 @@
#define HAVE_ARCH_BUG
#define BUG() \
do { \
+ instrumentation_begin(); \
_BUG_FLAGS(ASM_UD2, 0); \
unreachable(); \
} while (0)
#define __WARN_FLAGS(flags) \
do { \
+ instrumentation_begin(); \
_BUG_FLAGS(ASM_UD2, BUGFLAG_WARNING|(flags)); \
annotate_reachable(); \
+ instrumentation_end(); \
} while (0)
#include <asm-generic/bug.h>
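
With the instrumentation_begin()/end() pair folded into these macros,
WARN/BUG become usable from non-instrumentable code; a hedged sketch
(noinstr_example is a placeholder):

    noinstr void noinstr_example(int bad)
    {
            /*
             * __WARN_FLAGS() brackets its ud2 with instrumentation
             * annotations, so the warning is legal even here.
             */
            WARN_ON(bad);
    }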
diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
index 02c0078..8902fdb 100644
--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -11,15 +11,11 @@
#ifdef CONFIG_X86_64
/* Macro to enforce the same ordering and stack sizes */
-#define ESTACKS_MEMBERS(guardsize, db2_holesize)\
+#define ESTACKS_MEMBERS(guardsize) \
char DF_stack_guard[guardsize]; \
char DF_stack[EXCEPTION_STKSZ]; \
char NMI_stack_guard[guardsize]; \
char NMI_stack[EXCEPTION_STKSZ]; \
- char DB2_stack_guard[guardsize]; \
- char DB2_stack[db2_holesize]; \
- char DB1_stack_guard[guardsize]; \
- char DB1_stack[EXCEPTION_STKSZ]; \
char DB_stack_guard[guardsize]; \
char DB_stack[EXCEPTION_STKSZ]; \
char MCE_stack_guard[guardsize]; \
@@ -28,12 +24,12 @@
/* The exception stacks' physical storage. No guard pages required */
struct exception_stacks {
- ESTACKS_MEMBERS(0, 0)
+ ESTACKS_MEMBERS(0)
};
/* The effective cpu entry area mapping with guard pages. */
struct cea_exception_stacks {
- ESTACKS_MEMBERS(PAGE_SIZE, EXCEPTION_STKSZ)
+ ESTACKS_MEMBERS(PAGE_SIZE)
};
/*
@@ -42,8 +38,6 @@
enum exception_stack_ordering {
ESTACK_DF,
ESTACK_NMI,
- ESTACK_DB2,
- ESTACK_DB1,
ESTACK_DB,
ESTACK_MCE,
N_EXCEPTION_STACKS
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 1a8609a..e89558a 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -18,7 +18,7 @@
native_set_debugreg(register, value)
#endif
-static inline unsigned long native_get_debugreg(int regno)
+static __always_inline unsigned long native_get_debugreg(int regno)
{
unsigned long val = 0; /* Damn you, gcc! */
@@ -47,7 +47,7 @@
return val;
}
-static inline void native_set_debugreg(int regno, unsigned long value)
+static __always_inline void native_set_debugreg(int regno, unsigned long value)
{
switch (regno) {
case 0:
@@ -85,7 +85,7 @@
set_debugreg(0UL, 3);
}
-static inline int hw_breakpoint_active(void)
+static __always_inline bool hw_breakpoint_active(void)
{
return __this_cpu_read(cpu_dr7) & DR_GLOBAL_ENABLE_MASK;
}
@@ -94,24 +94,38 @@
extern void hw_breakpoint_restore(void);
-#ifdef CONFIG_X86_64
-DECLARE_PER_CPU(int, debug_stack_usage);
-static inline void debug_stack_usage_inc(void)
+static __always_inline unsigned long local_db_save(void)
{
- __this_cpu_inc(debug_stack_usage);
+ unsigned long dr7;
+
+ if (static_cpu_has(X86_FEATURE_HYPERVISOR) && !hw_breakpoint_active())
+ return 0;
+
+ get_debugreg(dr7, 7);
+ dr7 &= ~0x400; /* architecturally set bit */
+ if (dr7)
+ set_debugreg(0, 7);
+ /*
+ * Ensure the compiler doesn't lower the above statements into
+ * the critical section; disabling breakpoints late would not
+ * be good.
+ */
+ barrier();
+
+ return dr7;
}
-static inline void debug_stack_usage_dec(void)
+
+static __always_inline void local_db_restore(unsigned long dr7)
{
- __this_cpu_dec(debug_stack_usage);
+ /*
+ * Ensure the compiler doesn't raise this statement into
+ * the critical section; enabling breakpoints early would
+ * not be good.
+ */
+ barrier();
+ if (dr7)
+ set_debugreg(dr7, 7);
}
-void debug_stack_set_zero(void);
-void debug_stack_reset(void);
-#else /* !X86_64 */
-static inline void debug_stack_set_zero(void) { }
-static inline void debug_stack_reset(void) { }
-static inline void debug_stack_usage_inc(void) { }
-static inline void debug_stack_usage_dec(void) { }
-#endif /* X86_64 */
#ifdef CONFIG_CPU_SUP_AMD
extern void set_dr_addr_mask(unsigned long mask, int dr);
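
Usage-wise, the new pair brackets code that must not recurse through a data
breakpoint; a hedged sketch (example_no_breakpoints is a placeholder):

    static void example_no_breakpoints(void)
    {
            unsigned long dr7 = local_db_save();    /* clears DR7 if armed */

            /* no data-breakpoint #DB can fire in here */

            local_db_restore(dr7);                  /* re-arms if it was armed */
    }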
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 68a99d2..1ced11d 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -40,11 +40,6 @@
desc->l = 0;
}
-extern struct desc_ptr idt_descr;
-extern gate_desc idt_table[];
-extern const struct desc_ptr debug_idt_descr;
-extern gate_desc debug_idt_table[];
-
struct gdt_page {
struct desc_struct gdt[GDT_ENTRIES];
} __attribute__((aligned(PAGE_SIZE)));
@@ -214,7 +209,7 @@
asm volatile("lgdt %0"::"m" (*dtr));
}
-static inline void native_load_idt(const struct desc_ptr *dtr)
+static __always_inline void native_load_idt(const struct desc_ptr *dtr)
{
asm volatile("lidt %0"::"m" (*dtr));
}
@@ -386,64 +381,23 @@
desc->limit1 = (limit >> 16) & 0xf;
}
-void update_intr_gate(unsigned int n, const void *addr);
void alloc_intr_gate(unsigned int n, const void *addr);
extern unsigned long system_vectors[];
-#ifdef CONFIG_X86_64
-DECLARE_PER_CPU(u32, debug_idt_ctr);
-static inline bool is_debug_idt_enabled(void)
-{
- if (this_cpu_read(debug_idt_ctr))
- return true;
-
- return false;
-}
-
-static inline void load_debug_idt(void)
-{
- load_idt((const struct desc_ptr *)&debug_idt_descr);
-}
-#else
-static inline bool is_debug_idt_enabled(void)
-{
- return false;
-}
-
-static inline void load_debug_idt(void)
-{
-}
-#endif
-
-/*
- * The load_current_idt() must be called with interrupts disabled
- * to avoid races. That way the IDT will always be set back to the expected
- * descriptor. It's also called when a CPU is being initialized, and
- * that doesn't need to disable interrupts, as nothing should be
- * bothering the CPU then.
- */
-static inline void load_current_idt(void)
-{
- if (is_debug_idt_enabled())
- load_debug_idt();
- else
- load_idt((const struct desc_ptr *)&idt_descr);
-}
-
+extern void load_current_idt(void);
extern void idt_setup_early_handler(void);
extern void idt_setup_early_traps(void);
extern void idt_setup_traps(void);
extern void idt_setup_apic_and_irq_gates(void);
+extern bool idt_is_f00f_address(unsigned long address);
#ifdef CONFIG_X86_64
extern void idt_setup_early_pf(void);
extern void idt_setup_ist_traps(void);
-extern void idt_setup_debugidt_traps(void);
#else
static inline void idt_setup_early_pf(void) { }
static inline void idt_setup_ist_traps(void) { }
-static inline void idt_setup_debugidt_traps(void) { }
#endif
extern void idt_invalidate(void *addr);
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
deleted file mode 100644
index 4164227..0000000
--- a/arch/x86/include/asm/entry_arch.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * This file is designed to contain the BUILD_INTERRUPT specifications for
- * all of the extra named interrupt vectors used by the architecture.
- * Usually this is the Inter Process Interrupts (IPIs)
- */
-
-/*
- * The following vectors are part of the Linux architecture, there
- * is no hardware IRQ pin equivalent for them, they are triggered
- * through the ICC by us (IPIs)
- */
-#ifdef CONFIG_SMP
-BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
-BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
-BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
-BUILD_INTERRUPT(irq_move_cleanup_interrupt, IRQ_MOVE_CLEANUP_VECTOR)
-BUILD_INTERRUPT(reboot_interrupt, REBOOT_VECTOR)
-#endif
-
-#ifdef CONFIG_HAVE_KVM
-BUILD_INTERRUPT(kvm_posted_intr_ipi, POSTED_INTR_VECTOR)
-BUILD_INTERRUPT(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR)
-BUILD_INTERRUPT(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR)
-#endif
-
-/*
- * every pentium local APIC has two 'local interrupts', with a
- * soft-definable vector attached to both interrupts, one of
- * which is a timer interrupt, the other one is error counter
- * overflow. Linux uses the local APIC timer interrupt to get
- * a much simpler SMP time architecture:
- */
-#ifdef CONFIG_X86_LOCAL_APIC
-
-BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
-BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
-BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
-BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
-
-#ifdef CONFIG_IRQ_WORK
-BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR)
-#endif
-
-#ifdef CONFIG_X86_THERMAL_VECTOR
-BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
-#endif
-
-#ifdef CONFIG_X86_MCE_THRESHOLD
-BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
-#endif
-
-#ifdef CONFIG_X86_MCE_AMD
-BUILD_INTERRUPT(deferred_error_interrupt, DEFERRED_ERROR_VECTOR)
-#endif
-#endif
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 4154bc5..74c1243 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -28,28 +28,6 @@
#include <asm/irq.h>
#include <asm/sections.h>
-/* Interrupt handlers registered during init_IRQ */
-extern asmlinkage void apic_timer_interrupt(void);
-extern asmlinkage void x86_platform_ipi(void);
-extern asmlinkage void kvm_posted_intr_ipi(void);
-extern asmlinkage void kvm_posted_intr_wakeup_ipi(void);
-extern asmlinkage void kvm_posted_intr_nested_ipi(void);
-extern asmlinkage void error_interrupt(void);
-extern asmlinkage void irq_work_interrupt(void);
-extern asmlinkage void uv_bau_message_intr1(void);
-
-extern asmlinkage void spurious_interrupt(void);
-extern asmlinkage void thermal_interrupt(void);
-extern asmlinkage void reschedule_interrupt(void);
-
-extern asmlinkage void irq_move_cleanup_interrupt(void);
-extern asmlinkage void reboot_interrupt(void);
-extern asmlinkage void threshold_interrupt(void);
-extern asmlinkage void deferred_error_interrupt(void);
-
-extern asmlinkage void call_function_interrupt(void);
-extern asmlinkage void call_function_single_interrupt(void);
-
#ifdef CONFIG_X86_LOCAL_APIC
struct irq_data;
struct pci_dev;
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
new file mode 100644
index 0000000..cf51c50
--- /dev/null
+++ b/arch/x86/include/asm/idtentry.h
@@ -0,0 +1,652 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_IDTENTRY_H
+#define _ASM_X86_IDTENTRY_H
+
+/* Interrupts/Exceptions */
+#include <asm/trapnr.h>
+
+#ifndef __ASSEMBLY__
+#include <linux/hardirq.h>
+
+#include <asm/irq_stack.h>
+
+void idtentry_enter_user(struct pt_regs *regs);
+void idtentry_exit_user(struct pt_regs *regs);
+
+bool idtentry_enter_cond_rcu(struct pt_regs *regs);
+void idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit);
+
+/**
+ * DECLARE_IDTENTRY - Declare functions for simple IDT entry points
+ * No error code pushed by hardware
+ * @vector: Vector number (ignored for C)
+ * @func: Function name of the entry point
+ *
+ * Declares three functions:
+ * - The ASM entry point: asm_##func
+ * - The XEN PV trap entry point: xen_##func (maybe unused)
+ * - The C handler called from the ASM entry point
+ *
+ * Note: This is the C variant of DECLARE_IDTENTRY(). As the name says it
+ * declares the entry points for usage in C code. There is an ASM variant
+ * as well which is used to emit the entry stubs in entry_32/64.S.
+ */
+#define DECLARE_IDTENTRY(vector, func) \
+ asmlinkage void asm_##func(void); \
+ asmlinkage void xen_asm_##func(void); \
+ __visible void func(struct pt_regs *regs)
+
+/**
+ * DEFINE_IDTENTRY - Emit code for simple IDT entry points
+ * @func: Function name of the entry point
+ *
+ * @func is called from ASM entry code with interrupts disabled.
+ *
+ * The macro is written so it acts as a function definition. Append the
+ * body with a pair of curly brackets.
+ *
+ * idtentry_enter_cond_rcu() contains common code which has to be invoked
+ * before arbitrary code in the body. idtentry_exit_cond_rcu() contains
+ * common code which has to run before returning to the low level assembly
+ * code.
+ */
+#define DEFINE_IDTENTRY(func) \
+static __always_inline void __##func(struct pt_regs *regs); \
+ \
+__visible noinstr void func(struct pt_regs *regs) \
+{ \
+ bool rcu_exit = idtentry_enter_cond_rcu(regs); \
+ \
+ instrumentation_begin(); \
+ __##func (regs); \
+ instrumentation_end(); \
+ idtentry_exit_cond_rcu(regs, rcu_exit); \
+} \
+ \
+static __always_inline void __##func(struct pt_regs *regs)
+
+/* Special case for 32bit IRET 'trap' */
+#define DECLARE_IDTENTRY_SW DECLARE_IDTENTRY
+#define DEFINE_IDTENTRY_SW DEFINE_IDTENTRY
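
Put together, a trivial and purely illustrative handler built with this pair
would look like the following; exc_example and the X86_TRAP_OF vector are
placeholders:

    DECLARE_IDTENTRY(X86_TRAP_OF, exc_example);

    DEFINE_IDTENTRY(exc_example)
    {
            /* body runs between idtentry_enter_cond_rcu()/exit_cond_rcu() */
            pr_warn("trap at %pS\n", (void *)regs->ip);
    }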
+
+/**
+ * DECLARE_IDTENTRY_ERRORCODE - Declare functions for simple IDT entry points
+ * Error code pushed by hardware
+ * @vector: Vector number (ignored for C)
+ * @func: Function name of the entry point
+ *
+ * Declares three functions:
+ * - The ASM entry point: asm_##func
+ * - The XEN PV trap entry point: xen_##func (maybe unused)
+ * - The C handler called from the ASM entry point
+ *
+ * Same as DECLARE_IDTENTRY, but has an extra error_code argument for the
+ * C-handler.
+ */
+#define DECLARE_IDTENTRY_ERRORCODE(vector, func) \
+ asmlinkage void asm_##func(void); \
+ asmlinkage void xen_asm_##func(void); \
+ __visible void func(struct pt_regs *regs, unsigned long error_code)
+
+/**
+ * DEFINE_IDTENTRY_ERRORCODE - Emit code for simple IDT entry points
+ * Error code pushed by hardware
+ * @func: Function name of the entry point
+ *
+ * Same as DEFINE_IDTENTRY, but has an extra error_code argument
+ */
+#define DEFINE_IDTENTRY_ERRORCODE(func) \
+static __always_inline void __##func(struct pt_regs *regs, \
+ unsigned long error_code); \
+ \
+__visible noinstr void func(struct pt_regs *regs, \
+ unsigned long error_code) \
+{ \
+ bool rcu_exit = idtentry_enter_cond_rcu(regs); \
+ \
+ instrumentation_begin(); \
+ __##func (regs, error_code); \
+ instrumentation_end(); \
+ idtentry_exit_cond_rcu(regs, rcu_exit); \
+} \
+ \
+static __always_inline void __##func(struct pt_regs *regs, \
+ unsigned long error_code)
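
Illustration of the errorcode variant, with the hardware-pushed value
arriving as the second argument (placeholder names):

    DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_GP, exc_example_prot);

    DEFINE_IDTENTRY_ERRORCODE(exc_example_prot)
    {
            pr_warn("fault at %pS, error_code=%lx\n",
                    (void *)regs->ip, error_code);
    }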
+
+/**
+ * DECLARE_IDTENTRY_RAW - Declare functions for raw IDT entry points
+ * No error code pushed by hardware
+ * @vector: Vector number (ignored for C)
+ * @func: Function name of the entry point
+ *
+ * Maps to DECLARE_IDTENTRY().
+ */
+#define DECLARE_IDTENTRY_RAW(vector, func) \
+ DECLARE_IDTENTRY(vector, func)
+
+/**
+ * DEFINE_IDTENTRY_RAW - Emit code for raw IDT entry points
+ * @func: Function name of the entry point
+ *
+ * @func is called from ASM entry code with interrupts disabled.
+ *
+ * The macro is written so it acts as a function definition. Append the
+ * body with a pair of curly brackets.
+ *
+ * Contrary to DEFINE_IDTENTRY() this does not invoke the
+ * idtentry_enter/exit() helpers before and after the body invocation. This
+ * needs to be done in the body itself if applicable. Use if extra work
+ * is required before the enter/exit() helpers are invoked.
+ */
+#define DEFINE_IDTENTRY_RAW(func) \
+__visible noinstr void func(struct pt_regs *regs)
+
+/**
+ * DECLARE_IDTENTRY_RAW_ERRORCODE - Declare functions for raw IDT entry points
+ * Error code pushed by hardware
+ * @vector: Vector number (ignored for C)
+ * @func: Function name of the entry point
+ *
+ * Maps to DECLARE_IDTENTRY_ERRORCODE()
+ */
+#define DECLARE_IDTENTRY_RAW_ERRORCODE(vector, func) \
+ DECLARE_IDTENTRY_ERRORCODE(vector, func)
+
+/**
+ * DEFINE_IDTENTRY_RAW_ERRORCODE - Emit code for raw IDT entry points
+ * @func: Function name of the entry point
+ *
+ * @func is called from ASM entry code with interrupts disabled.
+ *
+ * The macro is written so it acts as a function definition. Append the
+ * body enclosed in a pair of curly brackets.
+ *
+ * Contrary to DEFINE_IDTENTRY_ERRORCODE(), this does not invoke the
+ * idtentry_enter/exit() helpers before and after the body invocation. If
+ * required, this needs to be done in the body itself. Use this variant
+ * when extra work must happen before the enter/exit() helpers are invoked.
+ */
+#define DEFINE_IDTENTRY_RAW_ERRORCODE(func) \
+__visible noinstr void func(struct pt_regs *regs, unsigned long error_code)
+
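As a concrete case, the page fault conversion in this series uses this raw variant so it can read CR2 before the enter() helper runs; a reconstructed, abridged sketch (not the verbatim patch):

    DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
    {
            unsigned long address = read_cr2();     /* before enter() */
            bool rcu_exit;

            rcu_exit = idtentry_enter_cond_rcu(regs);
            instrumentation_begin();
            handle_page_fault(regs, error_code, address);
            instrumentation_end();
            idtentry_exit_cond_rcu(regs, rcu_exit);
    }
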
+/**
+ * DECLARE_IDTENTRY_IRQ - Declare functions for device interrupt IDT entry
+ * points (common/spurious)
+ * @vector: Vector number (ignored for C)
+ * @func: Function name of the entry point
+ *
+ * Maps to DECLARE_IDTENTRY_ERRORCODE()
+ */
+#define DECLARE_IDTENTRY_IRQ(vector, func) \
+ DECLARE_IDTENTRY_ERRORCODE(vector, func)
+
+/**
+ * DEFINE_IDTENTRY_IRQ - Emit code for device interrupt IDT entry points
+ * @func: Function name of the entry point
+ *
+ * The vector number is pushed by the low level entry stub and handed
+ * to the function as the error_code argument, which needs to be truncated
+ * to a u8 because the push is sign-extending.
+ *
+ * On 64-bit idtentry_enter/exit() are invoked in the ASM entry code before
+ * and after switching to the interrupt stack. On 32-bit this happens in C.
+ *
+ * irq_enter/exit_rcu() are invoked before the function body and the
+ * KVM L1D flush request is set.
+ */
+#define DEFINE_IDTENTRY_IRQ(func) \
+static __always_inline void __##func(struct pt_regs *regs, u8 vector); \
+ \
+__visible noinstr void func(struct pt_regs *regs, \
+ unsigned long error_code) \
+{ \
+ bool rcu_exit = idtentry_enter_cond_rcu(regs); \
+ \
+ instrumentation_begin(); \
+ irq_enter_rcu(); \
+ kvm_set_cpu_l1tf_flush_l1d(); \
+ __##func (regs, (u8)error_code); \
+ irq_exit_rcu(); \
+ instrumentation_end(); \
+ idtentry_exit_cond_rcu(regs, rcu_exit); \
+} \
+ \
+static __always_inline void __##func(struct pt_regs *regs, u8 vector)
+
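An abridged sketch of the common_interrupt() handler that this macro wraps later in the series (the handling of unused vectors is trimmed here):

    DEFINE_IDTENTRY_IRQ(common_interrupt)
    {
            struct pt_regs *old_regs = set_irq_regs(regs);
            struct irq_desc *desc;

            /* 'vector' is the u8-truncated value pushed by the stub */
            desc = __this_cpu_read(vector_irq[vector]);
            if (likely(!IS_ERR_OR_NULL(desc)))
                    handle_irq(desc, regs);
            else
                    ack_APIC_irq();         /* unused/spurious vector */
            set_irq_regs(old_regs);
    }
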
+/**
+ * DECLARE_IDTENTRY_SYSVEC - Declare functions for system vector entry points
+ * @vector: Vector number (ignored for C)
+ * @func: Function name of the entry point
+ *
+ * Declares three functions:
+ * - The ASM entry point: asm_##func
+ * - The XEN PV trap entry point: xen_asm_##func (maybe unused)
+ * - The C handler called from the ASM entry point
+ *
+ * Maps to DECLARE_IDTENTRY().
+ */
+#define DECLARE_IDTENTRY_SYSVEC(vector, func) \
+ DECLARE_IDTENTRY(vector, func)
+
+/**
+ * DEFINE_IDTENTRY_SYSVEC - Emit code for system vector IDT entry points
+ * @func: Function name of the entry point
+ *
+ * idtentry_enter/exit() and irq_enter/exit_rcu() are invoked before the
+ * function body. The KVM L1D flush request is set.
+ *
+ * Runs the function on the interrupt stack if the entry hit kernel mode.
+ */
+#define DEFINE_IDTENTRY_SYSVEC(func) \
+static void __##func(struct pt_regs *regs); \
+ \
+__visible noinstr void func(struct pt_regs *regs) \
+{ \
+ bool rcu_exit = idtentry_enter_cond_rcu(regs); \
+ \
+ instrumentation_begin(); \
+ irq_enter_rcu(); \
+ kvm_set_cpu_l1tf_flush_l1d(); \
+ run_on_irqstack_cond(__##func, regs, regs); \
+ irq_exit_rcu(); \
+ instrumentation_end(); \
+ idtentry_exit_cond_rcu(regs, rcu_exit); \
+} \
+ \
+static noinline void __##func(struct pt_regs *regs)
+
+/**
+ * DEFINE_IDTENTRY_SYSVEC_SIMPLE - Emit code for simple system vector IDT
+ * entry points
+ * @func: Function name of the entry point
+ *
+ * Runs the function on the interrupted stack. No switch to IRQ stack and
+ * only the minimal __irq_enter/exit() handling.
+ *
+ * Only use this for 'empty' vectors like the reschedule IPI and the KVM
+ * posted interrupt vectors.
+ */
+#define DEFINE_IDTENTRY_SYSVEC_SIMPLE(func) \
+static __always_inline void __##func(struct pt_regs *regs); \
+ \
+__visible noinstr void func(struct pt_regs *regs) \
+{ \
+ bool rcu_exit = idtentry_enter_cond_rcu(regs); \
+ \
+ instrumentation_begin(); \
+ __irq_enter_raw(); \
+ kvm_set_cpu_l1tf_flush_l1d(); \
+ __##func (regs); \
+ __irq_exit_raw(); \
+ instrumentation_end(); \
+ idtentry_exit_cond_rcu(regs, rcu_exit); \
+} \
+ \
+static __always_inline void __##func(struct pt_regs *regs)
+
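For instance, the reschedule IPI declared further down is defined with this simple variant because its body is nearly empty (sketch based on the smp.c conversion in this series; tracepoints omitted):

    DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_reschedule_ipi)
    {
            ack_APIC_irq();
            inc_irq_stat(irq_resched_count);
            scheduler_ipi();
    }
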
+/**
+ * DECLARE_IDTENTRY_XENCB - Declare functions for XEN HV callback entry point
+ * @vector: Vector number (ignored for C)
+ * @func: Function name of the entry point
+ *
+ * Declares three functions:
+ * - The ASM entry point: asm_##func
+ * - The XEN PV trap entry point: xen_asm_##func (maybe unused)
+ * - The C handler called from the ASM entry point
+ *
+ * Maps to DECLARE_IDTENTRY(). Distinct entry point to handle the 32/64-bit
+ * difference.
+ */
+#define DECLARE_IDTENTRY_XENCB(vector, func) \
+ DECLARE_IDTENTRY(vector, func)
+
+#ifdef CONFIG_X86_64
+/**
+ * DECLARE_IDTENTRY_IST - Declare functions for IST handling IDT entry points
+ * @vector: Vector number (ignored for C)
+ * @func: Function name of the entry point
+ *
+ * Maps to DECLARE_IDTENTRY_RAW, but also declares the NOIST C handler,
+ * which is called from the ASM entry point on user mode entry.
+ */
+#define DECLARE_IDTENTRY_IST(vector, func) \
+ DECLARE_IDTENTRY_RAW(vector, func); \
+ __visible void noist_##func(struct pt_regs *regs)
+
+/**
+ * DEFINE_IDTENTRY_IST - Emit code for IST entry points
+ * @func: Function name of the entry point
+ *
+ * Maps to DEFINE_IDTENTRY_RAW
+ */
+#define DEFINE_IDTENTRY_IST(func) \
+ DEFINE_IDTENTRY_RAW(func)
+
+/**
+ * DEFINE_IDTENTRY_NOIST - Emit code for NOIST entry points which
+ * belong to an IST entry point (MCE, DB)
+ * @func: Function name of the entry point. Must be the same as
+ * the function name of the corresponding IST variant
+ *
+ * Maps to DEFINE_IDTENTRY_RAW().
+ */
+#define DEFINE_IDTENTRY_NOIST(func) \
+ DEFINE_IDTENTRY_RAW(noist_##func)
+
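A sketch of the intended IST/NOIST pairing, using the machine check handler as the example (simplified; the real conversion wraps do_machine_check() with mode-specific glue):

    DEFINE_IDTENTRY_IST(exc_machine_check)     /* emits exc_machine_check() */
    {
            do_machine_check(regs); /* kernel mode entry, on the IST stack */
    }

    DEFINE_IDTENTRY_NOIST(exc_machine_check)   /* emits noist_exc_machine_check() */
    {
            do_machine_check(regs); /* user mode entry, on the regular stack */
    }
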
+/**
+ * DECLARE_IDTENTRY_DF - Declare functions for double fault
+ * @vector: Vector number (ignored for C)
+ * @func: Function name of the entry point
+ *
+ * Maps to DECLARE_IDTENTRY_RAW_ERRORCODE
+ */
+#define DECLARE_IDTENTRY_DF(vector, func) \
+ DECLARE_IDTENTRY_RAW_ERRORCODE(vector, func)
+
+/**
+ * DEFINE_IDTENTRY_DF - Emit code for double fault
+ * @func: Function name of the entry point
+ *
+ * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE
+ */
+#define DEFINE_IDTENTRY_DF(func) \
+ DEFINE_IDTENTRY_RAW_ERRORCODE(func)
+
+#else /* CONFIG_X86_64 */
+
+/* Maps to a regular IDTENTRY on 32bit for now */
+# define DECLARE_IDTENTRY_IST DECLARE_IDTENTRY
+# define DEFINE_IDTENTRY_IST DEFINE_IDTENTRY
+
+/**
+ * DECLARE_IDTENTRY_DF - Declare functions for double fault 32bit variant
+ * @vector: Vector number (ignored for C)
+ * @func: Function name of the entry point
+ *
+ * Declares two functions:
+ * - The ASM entry point: asm_##func
+ * - The C handler called from the C shim
+ */
+#define DECLARE_IDTENTRY_DF(vector, func) \
+ asmlinkage void asm_##func(void); \
+ __visible void func(struct pt_regs *regs, \
+ unsigned long error_code, \
+ unsigned long address)
+
+/**
+ * DEFINE_IDTENTRY_DF - Emit code for double fault on 32bit
+ * @func: Function name of the entry point
+ *
+ * This is called through the doublefault shim, which already provides
+ * cr2 in the address argument.
+ */
+#define DEFINE_IDTENTRY_DF(func) \
+__visible noinstr void func(struct pt_regs *regs, \
+ unsigned long error_code, \
+ unsigned long address)
+
+#endif /* !CONFIG_X86_64 */
+
+/* C-Code mapping */
+#define DECLARE_IDTENTRY_MCE DECLARE_IDTENTRY_IST
+#define DEFINE_IDTENTRY_MCE DEFINE_IDTENTRY_IST
+#define DEFINE_IDTENTRY_MCE_USER DEFINE_IDTENTRY_NOIST
+
+#define DECLARE_IDTENTRY_NMI DECLARE_IDTENTRY_RAW
+#define DEFINE_IDTENTRY_NMI DEFINE_IDTENTRY_RAW
+
+#define DECLARE_IDTENTRY_DEBUG DECLARE_IDTENTRY_IST
+#define DEFINE_IDTENTRY_DEBUG DEFINE_IDTENTRY_IST
+#define DEFINE_IDTENTRY_DEBUG_USER DEFINE_IDTENTRY_NOIST
+
+/**
+ * DECLARE_IDTENTRY_XEN - Declare functions for XEN redirect IDT entry points
+ * @vector: Vector number (ignored for C)
+ * @func: Function name of the entry point
+ *
+ * Used for xennmi and xendebug redirections. No DEFINE as this is all ASM
+ * indirection magic.
+ */
+#define DECLARE_IDTENTRY_XEN(vector, func) \
+ asmlinkage void xen_asm_exc_xen##func(void); \
+ asmlinkage void asm_exc_xen##func(void)
+
+#else /* !__ASSEMBLY__ */
+
+/*
+ * The ASM variants for DECLARE_IDTENTRY*() which emit the ASM entry stubs.
+ */
+#define DECLARE_IDTENTRY(vector, func) \
+ idtentry vector asm_##func func has_error_code=0
+
+#define DECLARE_IDTENTRY_ERRORCODE(vector, func) \
+ idtentry vector asm_##func func has_error_code=1
+
+/* Special case for 32bit IRET 'trap'. Do not emit ASM code */
+#define DECLARE_IDTENTRY_SW(vector, func)
+
+#define DECLARE_IDTENTRY_RAW(vector, func) \
+ DECLARE_IDTENTRY(vector, func)
+
+#define DECLARE_IDTENTRY_RAW_ERRORCODE(vector, func) \
+ DECLARE_IDTENTRY_ERRORCODE(vector, func)
+
+/* Entries for common/spurious (device) interrupts */
+#define DECLARE_IDTENTRY_IRQ(vector, func) \
+ idtentry_irq vector func
+
+/* System vector entries */
+#define DECLARE_IDTENTRY_SYSVEC(vector, func) \
+ idtentry_sysvec vector func
+
+#ifdef CONFIG_X86_64
+# define DECLARE_IDTENTRY_MCE(vector, func) \
+ idtentry_mce_db vector asm_##func func
+
+# define DECLARE_IDTENTRY_DEBUG(vector, func) \
+ idtentry_mce_db vector asm_##func func
+
+# define DECLARE_IDTENTRY_DF(vector, func) \
+ idtentry_df vector asm_##func func
+
+# define DECLARE_IDTENTRY_XENCB(vector, func) \
+ DECLARE_IDTENTRY(vector, func)
+
+#else
+# define DECLARE_IDTENTRY_MCE(vector, func) \
+ DECLARE_IDTENTRY(vector, func)
+
+# define DECLARE_IDTENTRY_DEBUG(vector, func) \
+ DECLARE_IDTENTRY(vector, func)
+
+/* No ASM emitted for DF as this goes through a C shim */
+# define DECLARE_IDTENTRY_DF(vector, func)
+
+/* No ASM emitted for XEN hypervisor callback */
+# define DECLARE_IDTENTRY_XENCB(vector, func)
+
+#endif
+
+/* No ASM code emitted for NMI */
+#define DECLARE_IDTENTRY_NMI(vector, func)
+
+/* XEN NMI and DB wrapper */
+#define DECLARE_IDTENTRY_XEN(vector, func) \
+ idtentry vector asm_exc_xen##func exc_##func has_error_code=0
+
+/*
+ * ASM code to emit the common vector entry stubs where each stub is
+ * packed into 8 bytes.
+ *
+ * Note that the 'pushq imm8' is emitted via '.byte 0x6a, vector' because
+ * GCC treats the local vector variable as unsigned int and would expand
+ * all vectors above 0x7F to a 5 byte push. The original code adjusted the
+ * vector number into the signed byte range to avoid this. While clever,
+ * it's mind-bogglingly counterintuitive and requires an odd conversion back
+ * to a real vector number in the C entry points. Using .byte achieves the
+ * same thing and the only fixup needed in the C entry point is to mask off
+ * the bits above bit 7 because the push is sign-extending.
+ */
+ .align 8
+SYM_CODE_START(irq_entries_start)
+ vector=FIRST_EXTERNAL_VECTOR
+ pos = .
+ .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
+ UNWIND_HINT_IRET_REGS
+ .byte 0x6a, vector
+ jmp asm_common_interrupt
+ nop
+ /* Ensure that the above is 8 bytes max */
+ . = pos + 8
+ pos=pos+8
+ vector=vector+1
+ .endr
+SYM_CODE_END(irq_entries_start)
+
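A small C illustration of that fixup (illustrative values, using the kernel's s8/u8 types): '.byte 0x6a, 0xec' decodes as a sign-extending push of imm8, and the (u8) cast in DEFINE_IDTENTRY_IRQ recovers the vector:

    unsigned long orig_ax = (unsigned long)(long)(s8)0xec;
    /* orig_ax == 0xffffffffffffffec after the sign-extending push */
    u8 vector = (u8)orig_ax;        /* == 0xec, the real vector number */
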
+#ifdef CONFIG_X86_LOCAL_APIC
+ .align 8
+SYM_CODE_START(spurious_entries_start)
+ vector=FIRST_SYSTEM_VECTOR
+ pos = .
+ .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR)
+ UNWIND_HINT_IRET_REGS
+ .byte 0x6a, vector
+ jmp asm_spurious_interrupt
+ nop
+ /* Ensure that the above is 8 bytes max */
+ . = pos + 8
+ pos=pos+8
+ vector=vector+1
+ .endr
+SYM_CODE_END(spurious_entries_start)
+#endif
+
+#endif /* __ASSEMBLY__ */
+
+/*
+ * The actual entry points. Note that DECLARE_IDTENTRY*() serves two
+ * purposes:
+ * - provide the function declarations when included from C-Code
+ * - emit the ASM stubs when included from entry_32/64.S
+ *
+ * This avoids duplicate defines and ensures that everything is consistent.
+ */
+
+/*
+ * Dummy trap number so the low level ASM macro vector number checks do not
+ * match, which results in emitting plain IDTENTRY stubs without bells and
+ * whistles.
+ */
+#define X86_TRAP_OTHER 0xFFFF
+
+/* Simple exception entry points. No hardware error code */
+DECLARE_IDTENTRY(X86_TRAP_DE, exc_divide_error);
+DECLARE_IDTENTRY(X86_TRAP_OF, exc_overflow);
+DECLARE_IDTENTRY(X86_TRAP_BR, exc_bounds);
+DECLARE_IDTENTRY(X86_TRAP_NM, exc_device_not_available);
+DECLARE_IDTENTRY(X86_TRAP_OLD_MF, exc_coproc_segment_overrun);
+DECLARE_IDTENTRY(X86_TRAP_SPURIOUS, exc_spurious_interrupt_bug);
+DECLARE_IDTENTRY(X86_TRAP_MF, exc_coprocessor_error);
+DECLARE_IDTENTRY(X86_TRAP_XF, exc_simd_coprocessor_error);
+
+/* 32bit software IRET trap. Do not emit ASM code */
+DECLARE_IDTENTRY_SW(X86_TRAP_IRET, iret_error);
+
+/* Simple exception entries with error code pushed by hardware */
+DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_TS, exc_invalid_tss);
+DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_NP, exc_segment_not_present);
+DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_SS, exc_stack_segment);
+DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_GP, exc_general_protection);
+DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_AC, exc_alignment_check);
+
+/* Raw exception entries which need extra work */
+DECLARE_IDTENTRY_RAW(X86_TRAP_UD, exc_invalid_op);
+DECLARE_IDTENTRY_RAW(X86_TRAP_BP, exc_int3);
+DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_PF, exc_page_fault);
+
+#ifdef CONFIG_X86_MCE
+DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check);
+#endif
+
+/* NMI */
+DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi);
+DECLARE_IDTENTRY_XEN(X86_TRAP_NMI, nmi);
+
+/* #DB */
+DECLARE_IDTENTRY_DEBUG(X86_TRAP_DB, exc_debug);
+DECLARE_IDTENTRY_XEN(X86_TRAP_DB, debug);
+
+/* #DF */
+DECLARE_IDTENTRY_DF(X86_TRAP_DF, exc_double_fault);
+
+#ifdef CONFIG_XEN_PV
+DECLARE_IDTENTRY_XENCB(X86_TRAP_OTHER, exc_xen_hypervisor_callback);
+#endif
+
+/* Device interrupts common/spurious */
+DECLARE_IDTENTRY_IRQ(X86_TRAP_OTHER, common_interrupt);
+#ifdef CONFIG_X86_LOCAL_APIC
+DECLARE_IDTENTRY_IRQ(X86_TRAP_OTHER, spurious_interrupt);
+#endif
+
+/* System vector entry points */
+#ifdef CONFIG_X86_LOCAL_APIC
+DECLARE_IDTENTRY_SYSVEC(ERROR_APIC_VECTOR, sysvec_error_interrupt);
+DECLARE_IDTENTRY_SYSVEC(SPURIOUS_APIC_VECTOR, sysvec_spurious_apic_interrupt);
+DECLARE_IDTENTRY_SYSVEC(LOCAL_TIMER_VECTOR, sysvec_apic_timer_interrupt);
+DECLARE_IDTENTRY_SYSVEC(X86_PLATFORM_IPI_VECTOR, sysvec_x86_platform_ipi);
+#endif
+
+#ifdef CONFIG_SMP
+DECLARE_IDTENTRY(RESCHEDULE_VECTOR, sysvec_reschedule_ipi);
+DECLARE_IDTENTRY_SYSVEC(IRQ_MOVE_CLEANUP_VECTOR, sysvec_irq_move_cleanup);
+DECLARE_IDTENTRY_SYSVEC(REBOOT_VECTOR, sysvec_reboot);
+DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_SINGLE_VECTOR, sysvec_call_function_single);
+DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_VECTOR, sysvec_call_function);
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+# ifdef CONFIG_X86_UV
+DECLARE_IDTENTRY_SYSVEC(UV_BAU_MESSAGE, sysvec_uv_bau_message);
+# endif
+
+# ifdef CONFIG_X86_MCE_THRESHOLD
+DECLARE_IDTENTRY_SYSVEC(THRESHOLD_APIC_VECTOR, sysvec_threshold);
+# endif
+
+# ifdef CONFIG_X86_MCE_AMD
+DECLARE_IDTENTRY_SYSVEC(DEFERRED_ERROR_VECTOR, sysvec_deferred_error);
+# endif
+
+# ifdef CONFIG_X86_THERMAL_VECTOR
+DECLARE_IDTENTRY_SYSVEC(THERMAL_APIC_VECTOR, sysvec_thermal);
+# endif
+
+# ifdef CONFIG_IRQ_WORK
+DECLARE_IDTENTRY_SYSVEC(IRQ_WORK_VECTOR, sysvec_irq_work);
+# endif
+#endif
+
+#ifdef CONFIG_HAVE_KVM
+DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_VECTOR, sysvec_kvm_posted_intr_ipi);
+DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_WAKEUP_VECTOR, sysvec_kvm_posted_intr_wakeup_ipi);
+DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested_ipi);
+#endif
+
+#if IS_ENABLED(CONFIG_HYPERV)
+DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback);
+DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment);
+DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_STIMER0_VECTOR, sysvec_hyperv_stimer0);
+#endif
+
+#if IS_ENABLED(CONFIG_ACRN_GUEST)
+DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_acrn_hv_callback);
+#endif
+
+#ifdef CONFIG_XEN_PVHVM
+DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_xen_hvm_callback);
+#endif
+
+#undef X86_TRAP_OTHER
+
+#endif
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 72fba0e..528c8a7 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -11,6 +11,13 @@
#include <asm/apicdef.h>
#include <asm/irq_vectors.h>
+/*
+ * The irq entry code is in the noinstr section and the start/end of
+ * __irqentry_text is emitted via labels. Make the build fail if
+ * something moves a C function into the __irq_entry section.
+ */
+#define __irq_entry __invalid_section
+
static inline int irq_canonicalize(int irq)
{
return ((irq == 2) ? 9 : irq);
@@ -26,17 +33,14 @@
#ifdef CONFIG_HAVE_KVM
extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void));
-extern __visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs);
-extern __visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs);
-extern __visible void smp_kvm_posted_intr_nested_ipi(struct pt_regs *regs);
#endif
extern void (*x86_platform_ipi_callback)(void);
extern void native_init_IRQ(void);
-extern void handle_irq(struct irq_desc *desc, struct pt_regs *regs);
+extern void __handle_irq(struct irq_desc *desc, struct pt_regs *regs);
-extern __visible void do_IRQ(struct pt_regs *regs);
+extern __visible void do_IRQ(struct pt_regs *regs, unsigned long vector);
extern void init_ISA_irqs(void);
@@ -46,7 +50,6 @@
void arch_trigger_cpumask_backtrace(const struct cpumask *mask,
bool exclude_self);
-extern __visible void smp_x86_platform_ipi(struct pt_regs *regs);
#define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace
#endif
diff --git a/arch/x86/include/asm/irq_regs.h b/arch/x86/include/asm/irq_regs.h
deleted file mode 100644
index 187ce59..0000000
--- a/arch/x86/include/asm/irq_regs.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Per-cpu current frame pointer - the location of the last exception frame on
- * the stack, stored in the per-cpu area.
- *
- * Jeremy Fitzhardinge <jeremy@goop.org>
- */
-#ifndef _ASM_X86_IRQ_REGS_H
-#define _ASM_X86_IRQ_REGS_H
-
-#include <asm/percpu.h>
-
-#define ARCH_HAS_OWN_IRQ_REGS
-
-DECLARE_PER_CPU(struct pt_regs *, irq_regs);
-
-static inline struct pt_regs *get_irq_regs(void)
-{
- return __this_cpu_read(irq_regs);
-}
-
-static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
-{
- struct pt_regs *old_regs;
-
- old_regs = get_irq_regs();
- __this_cpu_write(irq_regs, new_regs);
-
- return old_regs;
-}
-
-#endif /* _ASM_X86_IRQ_REGS_32_H */
diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h
new file mode 100644
index 0000000..4ae66f0
--- /dev/null
+++ b/arch/x86/include/asm/irq_stack.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_IRQ_STACK_H
+#define _ASM_X86_IRQ_STACK_H
+
+#include <linux/ptrace.h>
+
+#include <asm/processor.h>
+
+#ifdef CONFIG_X86_64
+static __always_inline bool irqstack_active(void)
+{
+ return __this_cpu_read(irq_count) != -1;
+}
+
+void asm_call_on_stack(void *sp, void *func, void *arg);
+
+static __always_inline void __run_on_irqstack(void *func, void *arg)
+{
+ void *tos = __this_cpu_read(hardirq_stack_ptr);
+
+ __this_cpu_add(irq_count, 1);
+ asm_call_on_stack(tos - 8, func, arg);
+ __this_cpu_sub(irq_count, 1);
+}
+
+#else /* CONFIG_X86_64 */
+static inline bool irqstack_active(void) { return false; }
+static inline void __run_on_irqstack(void *func, void *arg) { }
+#endif /* !CONFIG_X86_64 */
+
+static __always_inline bool irq_needs_irq_stack(struct pt_regs *regs)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ return false;
+ if (!regs)
+ return !irqstack_active();
+ return !user_mode(regs) && !irqstack_active();
+}
+
+static __always_inline void run_on_irqstack_cond(void *func, void *arg,
+ struct pt_regs *regs)
+{
+ void (*__func)(void *arg) = func;
+
+ lockdep_assert_irqs_disabled();
+
+ if (irq_needs_irq_stack(regs))
+ __run_on_irqstack(__func, arg);
+ else
+ __func(arg);
+}
+
+#endif
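A hypothetical user of run_on_irqstack_cond(), deferring a callback to the hard IRQ stack unless the current context is already on it (names made up for illustration):

    static void example_work(void *arg)
    {
            /* runs either on the IRQ stack or in place */
    }

    static void example_dispatch(struct pt_regs *regs)
    {
            /* interrupts must be off, see the lockdep assertion above */
            run_on_irqstack_cond(example_work, NULL, regs);
    }
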
diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h
index 80b35e3..800ffce 100644
--- a/arch/x86/include/asm/irq_work.h
+++ b/arch/x86/include/asm/irq_work.h
@@ -10,7 +10,6 @@
return boot_cpu_has(X86_FEATURE_APIC);
}
extern void arch_irq_work_raise(void);
-extern __visible void smp_irq_work_interrupt(struct pt_regs *regs);
#else
static inline bool arch_irq_work_has_interrupt(void)
{
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 8a0e56e..02a0cf5 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -17,7 +17,7 @@
/* Declaration required for gcc < 4.9 to prevent -Werror=missing-prototypes */
extern inline unsigned long native_save_fl(void);
-extern inline unsigned long native_save_fl(void)
+extern __always_inline unsigned long native_save_fl(void)
{
unsigned long flags;
@@ -44,12 +44,12 @@
:"memory", "cc");
}
-static inline void native_irq_disable(void)
+static __always_inline void native_irq_disable(void)
{
asm volatile("cli": : :"memory");
}
-static inline void native_irq_enable(void)
+static __always_inline void native_irq_enable(void)
{
asm volatile("sti": : :"memory");
}
@@ -74,22 +74,22 @@
#ifndef __ASSEMBLY__
#include <linux/types.h>
-static inline notrace unsigned long arch_local_save_flags(void)
+static __always_inline unsigned long arch_local_save_flags(void)
{
return native_save_fl();
}
-static inline notrace void arch_local_irq_restore(unsigned long flags)
+static __always_inline void arch_local_irq_restore(unsigned long flags)
{
native_restore_fl(flags);
}
-static inline notrace void arch_local_irq_disable(void)
+static __always_inline void arch_local_irq_disable(void)
{
native_irq_disable();
}
-static inline notrace void arch_local_irq_enable(void)
+static __always_inline void arch_local_irq_enable(void)
{
native_irq_enable();
}
@@ -115,7 +115,7 @@
/*
* For spinlocks, etc:
*/
-static inline notrace unsigned long arch_local_irq_save(void)
+static __always_inline unsigned long arch_local_irq_save(void)
{
unsigned long flags = arch_local_save_flags();
arch_local_irq_disable();
@@ -159,12 +159,12 @@
#endif /* CONFIG_PARAVIRT_XXL */
#ifndef __ASSEMBLY__
-static inline int arch_irqs_disabled_flags(unsigned long flags)
+static __always_inline int arch_irqs_disabled_flags(unsigned long flags)
{
return !(flags & X86_EFLAGS_IF);
}
-static inline int arch_irqs_disabled(void)
+static __always_inline int arch_irqs_disabled(void)
{
unsigned long flags = arch_local_save_flags();
@@ -172,38 +172,4 @@
}
#endif /* !__ASSEMBLY__ */
-#ifdef __ASSEMBLY__
-#ifdef CONFIG_TRACE_IRQFLAGS
-# define TRACE_IRQS_ON call trace_hardirqs_on_thunk;
-# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk;
-#else
-# define TRACE_IRQS_ON
-# define TRACE_IRQS_OFF
-#endif
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# ifdef CONFIG_X86_64
-# define LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
-# define LOCKDEP_SYS_EXIT_IRQ \
- TRACE_IRQS_ON; \
- sti; \
- call lockdep_sys_exit_thunk; \
- cli; \
- TRACE_IRQS_OFF;
-# else
-# define LOCKDEP_SYS_EXIT \
- pushl %eax; \
- pushl %ecx; \
- pushl %edx; \
- call lockdep_sys_exit; \
- popl %edx; \
- popl %ecx; \
- popl %eax;
-# define LOCKDEP_SYS_EXIT_IRQ
-# endif
-#else
-# define LOCKDEP_SYS_EXIT
-# define LOCKDEP_SYS_EXIT_IRQ
-#endif
-#endif /* __ASSEMBLY__ */
-
#endif
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 57fd196..49d3a9e 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -141,7 +141,7 @@
return;
}
-static inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token)
+static __always_inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token)
{
return false;
}
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index f9cea08..cf50382 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -127,6 +127,17 @@
#define MSR_AMD64_SMCA_MCx_DEADDR(x) (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x)))
+#define XEC(x, mask) (((x) >> 16) & mask)
+
+/* mce.kflags flag bits for logging etc. */
+#define MCE_HANDLED_CEC BIT_ULL(0)
+#define MCE_HANDLED_UC BIT_ULL(1)
+#define MCE_HANDLED_EXTLOG BIT_ULL(2)
+#define MCE_HANDLED_NFIT BIT_ULL(3)
+#define MCE_HANDLED_EDAC BIT_ULL(4)
+#define MCE_HANDLED_MCELOG BIT_ULL(5)
+#define MCE_IN_KERNEL_RECOV BIT_ULL(6)
+
/*
* This structure contains all data related to the MCE log. Also
* carries a signature to make it easier to find from external
@@ -142,14 +153,16 @@
struct mce entry[];
};
+/* Highest last */
enum mce_notifier_prios {
- MCE_PRIO_FIRST = INT_MAX,
- MCE_PRIO_UC = INT_MAX - 1,
- MCE_PRIO_EXTLOG = INT_MAX - 2,
- MCE_PRIO_NFIT = INT_MAX - 3,
- MCE_PRIO_EDAC = INT_MAX - 4,
- MCE_PRIO_MCELOG = 1,
- MCE_PRIO_LOWEST = 0,
+ MCE_PRIO_LOWEST,
+ MCE_PRIO_MCELOG,
+ MCE_PRIO_EDAC,
+ MCE_PRIO_NFIT,
+ MCE_PRIO_EXTLOG,
+ MCE_PRIO_UC,
+ MCE_PRIO_EARLY,
+ MCE_PRIO_CEC
};
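With the priorities reordered lowest-first, consumers still register on the decode chain as before; a hypothetical sketch using the new kflags (mce_register_decode_chain() is the existing API):

    static int example_decode(struct notifier_block *nb,
                              unsigned long val, void *data)
    {
            struct mce *m = data;

            /* claim the record so later/default handlers can skip it */
            m->kflags |= MCE_HANDLED_EDAC;
            return NOTIFY_OK;
    }

    static struct notifier_block example_mce_dec = {
            .notifier_call  = example_decode,
            .priority       = MCE_PRIO_EDAC,
    };
    /* registered via mce_register_decode_chain(&example_mce_dec) */
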
struct notifier_block;
@@ -238,7 +251,7 @@
/*
* Exception handler
*/
-void do_machine_check(struct pt_regs *, long);
+void do_machine_check(struct pt_regs *pt_regs);
/*
* Threshold handler
@@ -347,5 +360,4 @@
#endif
static inline void mce_hygon_feature_init(struct cpuinfo_x86 *c) { return mce_amd_feature_init(c); }
-
#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index d30805e..60b944d 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -54,20 +54,8 @@
vclocks_set_used(VDSO_CLOCKMODE_HVCLOCK);
#define hv_get_raw_timer() rdtsc_ordered()
-void hyperv_callback_vector(void);
-void hyperv_reenlightenment_vector(void);
-#ifdef CONFIG_TRACING
-#define trace_hyperv_callback_vector hyperv_callback_vector
-#endif
void hyperv_vector_handler(struct pt_regs *regs);
-/*
- * Routines for stimer0 Direct Mode handling.
- * On x86/x64, there are no percpu actions to take.
- */
-void hv_stimer0_vector_handler(struct pt_regs *regs);
-void hv_stimer0_callback_vector(void);
-
static inline void hv_enable_stimer0_percpu_irq(int irq) {}
static inline void hv_disable_stimer0_percpu_irq(int irq) {}
@@ -226,7 +214,6 @@
void *hv_alloc_hyperv_page(void);
void *hv_alloc_hyperv_zeroed_page(void);
void hv_free_hyperv_page(unsigned long addr);
-void hyperv_reenlightenment_intr(struct pt_regs *regs);
void set_hv_tscchange_cb(void (*cb)(void));
void clear_hv_tscchange_cb(void);
void hyperv_stop_tsc_emulation(void);
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index d52d1aa..e7752b4 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -262,7 +262,7 @@
* combination with microcode which triggers a CPU buffer flush when the
* instruction is executed.
*/
-static inline void mds_clear_cpu_buffers(void)
+static __always_inline void mds_clear_cpu_buffers(void)
{
static const u16 ds = __KERNEL_DS;
@@ -283,7 +283,7 @@
*
* Clear CPU buffers if the corresponding static key is enabled
*/
-static inline void mds_user_clear_cpu_buffers(void)
+static __always_inline void mds_user_clear_cpu_buffers(void)
{
if (static_branch_likely(&mds_user_clear))
mds_clear_cpu_buffers();
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 29ee0c0..42cd333 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -823,7 +823,7 @@
* Useful for spinlocks to avoid one state transition in the
* cache coherency protocol:
*/
-static inline void prefetchw(const void *x)
+static __always_inline void prefetchw(const void *x)
{
alternative_input(BASE_PREFETCH, "prefetchw %P1",
X86_FEATURE_3DNOWPREFETCH,
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 6d6475f..ebedeab 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -123,7 +123,7 @@
* On x86_64, vm86 mode is mercifully nonexistent, and we don't need
* the extra check.
*/
-static inline int user_mode(struct pt_regs *regs)
+static __always_inline int user_mode(struct pt_regs *regs)
{
#ifdef CONFIG_X86_32
return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >= USER_RPL;
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index ec2c0a0..5948218 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -86,28 +86,35 @@
extern int kernel_set_to_readonly;
#ifdef CONFIG_X86_64
-static inline int set_mce_nospec(unsigned long pfn)
+/*
+ * Prevent speculative access to the page by either unmapping
+ * it (if we do not require access to any part of the page) or
+ * marking it uncacheable (if we want to try to retrieve data
+ * from non-poisoned lines in the page).
+ */
+static inline int set_mce_nospec(unsigned long pfn, bool unmap)
{
unsigned long decoy_addr;
int rc;
/*
- * Mark the linear address as UC to make sure we don't log more
- * errors because of speculative access to the page.
* We would like to just call:
- * set_memory_uc((unsigned long)pfn_to_kaddr(pfn), 1);
+ * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1);
* but doing that would radically increase the odds of a
* speculative access to the poison page because we'd have
* the virtual address of the kernel 1:1 mapping sitting
* around in registers.
* Instead we get tricky. We create a non-canonical address
* that looks just like the one we want, but has bit 63 flipped.
- * This relies on set_memory_uc() properly sanitizing any __pa()
+ * This relies on set_memory_XX() properly sanitizing any __pa()
* results with __PHYSICAL_MASK or PTE_PFN_MASK.
*/
decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
- rc = set_memory_uc(decoy_addr, 1);
+ if (unmap)
+ rc = set_memory_np(decoy_addr, 1);
+ else
+ rc = set_memory_uc(decoy_addr, 1);
if (rc)
pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
return rc;
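A worked example of the decoy arithmetic (illustrative values, assuming PAGE_OFFSET is 0xffff888000000000):

    pfn        = 0x12345
    kaddr      = 0xffff888012345000   /* pfn_to_kaddr(pfn): canonical, avoided */
    decoy_addr = 0x7fff888012345000   /* bit 63 flipped: non-canonical */

    /* set_memory_np()/set_memory_uc() mask the decoy back to the same
     * physical page via __PHYSICAL_MASK, so the right PTE is changed. */
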
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 6d37b8f..eb8e781 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -7,6 +7,7 @@
#include <asm/nops.h>
#include <asm/processor-flags.h>
+#include <linux/irqflags.h>
#include <linux/jump_label.h>
/*
@@ -27,14 +28,14 @@
return val;
}
-static inline unsigned long native_read_cr2(void)
+static __always_inline unsigned long native_read_cr2(void)
{
unsigned long val;
asm volatile("mov %%cr2,%0\n\t" : "=r" (val), "=m" (__force_order));
return val;
}
-static inline void native_write_cr2(unsigned long val)
+static __always_inline void native_write_cr2(unsigned long val)
{
asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order));
}
@@ -129,7 +130,16 @@
asm volatile("wbinvd": : :"memory");
}
-extern asmlinkage void native_load_gs_index(unsigned);
+extern asmlinkage void asm_load_gs_index(unsigned int selector);
+
+static inline void native_load_gs_index(unsigned int selector)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ asm_load_gs_index(selector);
+ local_irq_restore(flags);
+}
static inline unsigned long __read_cr4(void)
{
@@ -150,12 +160,12 @@
native_write_cr0(x);
}
-static inline unsigned long read_cr2(void)
+static __always_inline unsigned long read_cr2(void)
{
return native_read_cr2();
}
-static inline void write_cr2(unsigned long x)
+static __always_inline void write_cr2(unsigned long x)
{
native_write_cr2(x);
}
@@ -186,7 +196,7 @@
#ifdef CONFIG_X86_64
-static inline void load_gs_index(unsigned selector)
+static inline void load_gs_index(unsigned int selector)
{
native_load_gs_index(selector);
}
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index 67315fa..6593b42 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -64,7 +64,7 @@
#define DISP32_SIZE 4
-static inline int text_opcode_size(u8 opcode)
+static __always_inline int text_opcode_size(u8 opcode)
{
int size = 0;
@@ -118,12 +118,14 @@
extern __ro_after_init unsigned long poking_addr;
#ifndef CONFIG_UML_X86
-static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip)
+static __always_inline
+void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip)
{
regs->ip = ip;
}
-static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val)
+static __always_inline
+void int3_emulate_push(struct pt_regs *regs, unsigned long val)
{
/*
* The int3 handler in entry_64.S adds a gap between the
@@ -138,7 +140,8 @@
*(unsigned long *)regs->sp = val;
}
-static inline void int3_emulate_call(struct pt_regs *regs, unsigned long func)
+static __always_inline
+void int3_emulate_call(struct pt_regs *regs, unsigned long func)
{
int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE);
int3_emulate_jmp(regs, func);
diff --git a/arch/x86/include/asm/trace/common.h b/arch/x86/include/asm/trace/common.h
index 57c8da02..f0f9bcd 100644
--- a/arch/x86/include/asm/trace/common.h
+++ b/arch/x86/include/asm/trace/common.h
@@ -5,12 +5,8 @@
DECLARE_STATIC_KEY_FALSE(trace_pagefault_key);
#define trace_pagefault_enabled() \
static_branch_unlikely(&trace_pagefault_key)
-DECLARE_STATIC_KEY_FALSE(trace_resched_ipi_key);
-#define trace_resched_ipi_enabled() \
- static_branch_unlikely(&trace_resched_ipi_key)
#else
static inline bool trace_pagefault_enabled(void) { return false; }
-static inline bool trace_resched_ipi_enabled(void) { return false; }
#endif
#endif
diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h
index 33b9d0f..88e7f0f 100644
--- a/arch/x86/include/asm/trace/irq_vectors.h
+++ b/arch/x86/include/asm/trace/irq_vectors.h
@@ -10,9 +10,6 @@
#ifdef CONFIG_X86_LOCAL_APIC
-extern int trace_resched_ipi_reg(void);
-extern void trace_resched_ipi_unreg(void);
-
DECLARE_EVENT_CLASS(x86_irq_vector,
TP_PROTO(int vector),
@@ -37,18 +34,6 @@
TP_PROTO(int vector), \
TP_ARGS(vector), NULL, NULL);
-#define DEFINE_RESCHED_IPI_EVENT(name) \
-DEFINE_EVENT_FN(x86_irq_vector, name##_entry, \
- TP_PROTO(int vector), \
- TP_ARGS(vector), \
- trace_resched_ipi_reg, \
- trace_resched_ipi_unreg); \
-DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \
- TP_PROTO(int vector), \
- TP_ARGS(vector), \
- trace_resched_ipi_reg, \
- trace_resched_ipi_unreg);
-
/*
* local_timer - called when entering/exiting a local timer interrupt
* vector handler
@@ -99,7 +84,7 @@
/*
* reschedule - called when entering/exiting a reschedule vector handler
*/
-DEFINE_RESCHED_IPI_EVENT(reschedule);
+DEFINE_IRQ_VECTOR_EVENT(reschedule);
/*
* call_function - called when entering/exiting a call function interrupt
diff --git a/arch/x86/include/asm/trapnr.h b/arch/x86/include/asm/trapnr.h
new file mode 100644
index 0000000..082f456
--- /dev/null
+++ b/arch/x86/include/asm/trapnr.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_TRAPNR_H
+#define _ASM_X86_TRAPNR_H
+
+/* Interrupts/Exceptions */
+
+#define X86_TRAP_DE 0 /* Divide-by-zero */
+#define X86_TRAP_DB 1 /* Debug */
+#define X86_TRAP_NMI 2 /* Non-maskable Interrupt */
+#define X86_TRAP_BP 3 /* Breakpoint */
+#define X86_TRAP_OF 4 /* Overflow */
+#define X86_TRAP_BR 5 /* Bound Range Exceeded */
+#define X86_TRAP_UD 6 /* Invalid Opcode */
+#define X86_TRAP_NM 7 /* Device Not Available */
+#define X86_TRAP_DF 8 /* Double Fault */
+#define X86_TRAP_OLD_MF 9 /* Coprocessor Segment Overrun */
+#define X86_TRAP_TS 10 /* Invalid TSS */
+#define X86_TRAP_NP 11 /* Segment Not Present */
+#define X86_TRAP_SS 12 /* Stack Segment Fault */
+#define X86_TRAP_GP 13 /* General Protection Fault */
+#define X86_TRAP_PF 14 /* Page Fault */
+#define X86_TRAP_SPURIOUS 15 /* Spurious Interrupt */
+#define X86_TRAP_MF 16 /* x87 Floating-Point Exception */
+#define X86_TRAP_AC 17 /* Alignment Check */
+#define X86_TRAP_MC 18 /* Machine Check */
+#define X86_TRAP_XF 19 /* SIMD Floating-Point Exception */
+#define X86_TRAP_VE 20 /* Virtualization Exception */
+#define X86_TRAP_CP 21 /* Control Protection Exception */
+#define X86_TRAP_IRET 32 /* IRET Exception */
+
+#endif
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 2ae904b..714b1a3 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -6,85 +6,9 @@
#include <linux/kprobes.h>
#include <asm/debugreg.h>
+#include <asm/idtentry.h>
#include <asm/siginfo.h> /* TRAP_TRACE, ... */
-#define dotraplinkage __visible
-
-asmlinkage void divide_error(void);
-asmlinkage void debug(void);
-asmlinkage void nmi(void);
-asmlinkage void int3(void);
-asmlinkage void overflow(void);
-asmlinkage void bounds(void);
-asmlinkage void invalid_op(void);
-asmlinkage void device_not_available(void);
-#ifdef CONFIG_X86_64
-asmlinkage void double_fault(void);
-#endif
-asmlinkage void coprocessor_segment_overrun(void);
-asmlinkage void invalid_TSS(void);
-asmlinkage void segment_not_present(void);
-asmlinkage void stack_segment(void);
-asmlinkage void general_protection(void);
-asmlinkage void page_fault(void);
-asmlinkage void async_page_fault(void);
-asmlinkage void spurious_interrupt_bug(void);
-asmlinkage void coprocessor_error(void);
-asmlinkage void alignment_check(void);
-#ifdef CONFIG_X86_MCE
-asmlinkage void machine_check(void);
-#endif /* CONFIG_X86_MCE */
-asmlinkage void simd_coprocessor_error(void);
-
-#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV)
-asmlinkage void xen_divide_error(void);
-asmlinkage void xen_xennmi(void);
-asmlinkage void xen_xendebug(void);
-asmlinkage void xen_int3(void);
-asmlinkage void xen_overflow(void);
-asmlinkage void xen_bounds(void);
-asmlinkage void xen_invalid_op(void);
-asmlinkage void xen_device_not_available(void);
-asmlinkage void xen_double_fault(void);
-asmlinkage void xen_coprocessor_segment_overrun(void);
-asmlinkage void xen_invalid_TSS(void);
-asmlinkage void xen_segment_not_present(void);
-asmlinkage void xen_stack_segment(void);
-asmlinkage void xen_general_protection(void);
-asmlinkage void xen_page_fault(void);
-asmlinkage void xen_spurious_interrupt_bug(void);
-asmlinkage void xen_coprocessor_error(void);
-asmlinkage void xen_alignment_check(void);
-#ifdef CONFIG_X86_MCE
-asmlinkage void xen_machine_check(void);
-#endif /* CONFIG_X86_MCE */
-asmlinkage void xen_simd_coprocessor_error(void);
-#endif
-
-dotraplinkage void do_divide_error(struct pt_regs *regs, long error_code);
-dotraplinkage void do_debug(struct pt_regs *regs, long error_code);
-dotraplinkage void do_nmi(struct pt_regs *regs, long error_code);
-dotraplinkage void do_int3(struct pt_regs *regs, long error_code);
-dotraplinkage void do_overflow(struct pt_regs *regs, long error_code);
-dotraplinkage void do_bounds(struct pt_regs *regs, long error_code);
-dotraplinkage void do_invalid_op(struct pt_regs *regs, long error_code);
-dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code);
-dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2);
-dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *regs, long error_code);
-dotraplinkage void do_invalid_TSS(struct pt_regs *regs, long error_code);
-dotraplinkage void do_segment_not_present(struct pt_regs *regs, long error_code);
-dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code);
-dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code);
-dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address);
-dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code);
-dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code);
-dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code);
-dotraplinkage void do_simd_coprocessor_error(struct pt_regs *regs, long error_code);
-#ifdef CONFIG_X86_32
-dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code);
-#endif
-dotraplinkage void do_mce(struct pt_regs *regs, long error_code);
-
#ifdef CONFIG_X86_64
asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs);
asmlinkage __visible notrace
@@ -92,6 +16,11 @@
void __init trap_init(void);
#endif
+#ifdef CONFIG_X86_F00F_BUG
+/* For handling the F00F bug */
+void handle_invalid_op(struct pt_regs *regs);
+#endif
+
static inline int get_si_code(unsigned long condition)
{
if (condition & DR_STEP)
@@ -105,16 +34,6 @@
extern int panic_on_unrecovered_nmi;
void math_emulate(struct math_emu_info *);
-#ifndef CONFIG_X86_32
-asmlinkage void smp_thermal_interrupt(struct pt_regs *regs);
-asmlinkage void smp_threshold_interrupt(struct pt_regs *regs);
-asmlinkage void smp_deferred_error_interrupt(struct pt_regs *regs);
-#endif
-
-void smp_apic_timer_interrupt(struct pt_regs *regs);
-void smp_spurious_interrupt(struct pt_regs *regs);
-void smp_error_interrupt(struct pt_regs *regs);
-asmlinkage void smp_irq_move_cleanup_interrupt(void);
#ifdef CONFIG_VMAP_STACK
void __noreturn handle_stack_overflow(const char *message,
@@ -122,31 +41,6 @@
unsigned long fault_address);
#endif
-/* Interrupts/Exceptions */
-enum {
- X86_TRAP_DE = 0, /* 0, Divide-by-zero */
- X86_TRAP_DB, /* 1, Debug */
- X86_TRAP_NMI, /* 2, Non-maskable Interrupt */
- X86_TRAP_BP, /* 3, Breakpoint */
- X86_TRAP_OF, /* 4, Overflow */
- X86_TRAP_BR, /* 5, Bound Range Exceeded */
- X86_TRAP_UD, /* 6, Invalid Opcode */
- X86_TRAP_NM, /* 7, Device Not Available */
- X86_TRAP_DF, /* 8, Double Fault */
- X86_TRAP_OLD_MF, /* 9, Coprocessor Segment Overrun */
- X86_TRAP_TS, /* 10, Invalid TSS */
- X86_TRAP_NP, /* 11, Segment Not Present */
- X86_TRAP_SS, /* 12, Stack Segment Fault */
- X86_TRAP_GP, /* 13, General Protection Fault */
- X86_TRAP_PF, /* 14, Page Fault */
- X86_TRAP_SPURIOUS, /* 15, Spurious Interrupt */
- X86_TRAP_MF, /* 16, x87 Floating-Point Exception */
- X86_TRAP_AC, /* 17, Alignment Check */
- X86_TRAP_MC, /* 18, Machine Check */
- X86_TRAP_XF, /* 19, SIMD Floating-Point Exception */
- X86_TRAP_IRET = 32, /* 32, IRET Exception */
-};
-
/*
* Page fault error code bits:
*
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 13687bf..f1188bd 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -12,6 +12,8 @@
#define _ASM_X86_UV_UV_BAU_H
#include <linux/bitmap.h>
+#include <asm/idtentry.h>
+
#define BITSPERBYTE 8
/*
@@ -799,12 +801,6 @@
bitmap_zero(&dstp->bits, nbits);
}
-extern void uv_bau_message_intr1(void);
-#ifdef CONFIG_TRACING
-#define trace_uv_bau_message_intr1 uv_bau_message_intr1
-#endif
-extern void uv_bau_timeout_intr1(void);
-
struct atomic_short {
short counter;
};
diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h
index 955c2a2..db9adc0 100644
--- a/arch/x86/include/uapi/asm/mce.h
+++ b/arch/x86/include/uapi/asm/mce.h
@@ -35,6 +35,7 @@
__u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */
__u64 ppin; /* Protected Processor Inventory Number */
__u32 microcode; /* Microcode revision */
+ __u64 kflags; /* Internal kernel use */
};
#define MCE_GET_RECORD_LEN _IOR('M', 1, int)
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index a9195ce..8fd39ff 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1011,28 +1011,29 @@
static struct bp_patching_desc *bp_desc;
-static inline struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp)
+static __always_inline
+struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp)
{
- struct bp_patching_desc *desc = READ_ONCE(*descp); /* rcu_dereference */
+ struct bp_patching_desc *desc = __READ_ONCE(*descp); /* rcu_dereference */
- if (!desc || !atomic_inc_not_zero(&desc->refs))
+ if (!desc || !arch_atomic_inc_not_zero(&desc->refs))
return NULL;
return desc;
}
-static inline void put_desc(struct bp_patching_desc *desc)
+static __always_inline void put_desc(struct bp_patching_desc *desc)
{
smp_mb__before_atomic();
- atomic_dec(&desc->refs);
+ arch_atomic_dec(&desc->refs);
}
-static inline void *text_poke_addr(struct text_poke_loc *tp)
+static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
{
return _stext + tp->rel_addr;
}
-static int notrace patch_cmp(const void *key, const void *elt)
+static __always_inline int patch_cmp(const void *key, const void *elt)
{
struct text_poke_loc *tp = (struct text_poke_loc *) elt;
@@ -1042,9 +1043,8 @@
return 1;
return 0;
}
-NOKPROBE_SYMBOL(patch_cmp);
-int notrace poke_int3_handler(struct pt_regs *regs)
+int noinstr poke_int3_handler(struct pt_regs *regs)
{
struct bp_patching_desc *desc;
struct text_poke_loc *tp;
@@ -1077,9 +1077,9 @@
* Skip the binary search if there is a single member in the vector.
*/
if (unlikely(desc->nr_entries > 1)) {
- tp = bsearch(ip, desc->vec, desc->nr_entries,
- sizeof(struct text_poke_loc),
- patch_cmp);
+ tp = __inline_bsearch(ip, desc->vec, desc->nr_entries,
+ sizeof(struct text_poke_loc),
+ patch_cmp);
if (!tp)
goto out_put;
} else {
@@ -1118,7 +1118,6 @@
put_desc(desc);
return ret;
}
-NOKPROBE_SYMBOL(poke_int3_handler);
#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
static struct text_poke_loc tp_vec[TP_VEC_MAX];
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index b6b3297..18f6b7c 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -18,9 +18,11 @@
#define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450
#define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0
#define PCI_DEVICE_ID_AMD_17H_M30H_ROOT 0x1480
+#define PCI_DEVICE_ID_AMD_17H_M60H_ROOT 0x1630
#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464
#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec
#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494
+#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c
#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444
#define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654
@@ -33,6 +35,7 @@
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_ROOT) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_ROOT) },
{}
};
@@ -50,6 +53,7 @@
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) },
@@ -65,6 +69,7 @@
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index bf4acb0..e0e2f02 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1088,23 +1088,14 @@
* [ if a single-CPU system runs an SMP kernel then we call the local
* interrupt as well. Thus we cannot inline the local irq ... ]
*/
-__visible void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_apic_timer_interrupt)
{
struct pt_regs *old_regs = set_irq_regs(regs);
- /*
- * NOTE! We'd better ACK the irq immediately,
- * because timer handling can be slow.
- *
- * update_process_times() expects us to have done irq_enter().
- * Besides, if we don't timer interrupts ignore the global
- * interrupt lock, which is the WrongThing (tm) to do.
- */
- entering_ack_irq();
+ ack_APIC_irq();
trace_local_timer_entry(LOCAL_TIMER_VECTOR);
local_apic_timer_interrupt();
trace_local_timer_exit(LOCAL_TIMER_VECTOR);
- exiting_irq();
set_irq_regs(old_regs);
}
@@ -2120,15 +2111,21 @@
* Local APIC interrupts
*/
-/*
- * This interrupt should _never_ happen with our APIC/SMP architecture
+/**
+ * spurious_interrupt - Catch-all for interrupts raised on unused vectors
+ * @regs: Pointer to pt_regs on stack
+ * @vector: The vector number
+ *
+ * This is invoked from ASM entry code to catch all interrupts that
+ * arrive on an entry routed to the common_spurious idtentry point.
+ *
+ * Also called from sysvec_spurious_apic_interrupt().
*/
-__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_IRQ(spurious_interrupt)
{
- u8 vector = ~regs->orig_ax;
u32 v;
- entering_irq();
trace_spurious_apic_entry(vector);
inc_irq_stat(irq_spurious_count);
@@ -2158,13 +2155,17 @@
}
out:
trace_spurious_apic_exit(vector);
- exiting_irq();
+}
+
+DEFINE_IDTENTRY_SYSVEC(sysvec_spurious_apic_interrupt)
+{
+ __spurious_interrupt(regs, SPURIOUS_APIC_VECTOR);
}
/*
* This interrupt should never happen with our APIC/SMP architecture
*/
-__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_error_interrupt)
{
static const char * const error_interrupt_reason[] = {
"Send CS error", /* APIC Error Bit 0 */
@@ -2178,7 +2179,6 @@
};
u32 v, i = 0;
- entering_irq();
trace_error_apic_entry(ERROR_APIC_VECTOR);
/* First tickle the hardware, only then report what went on. -- REW */
@@ -2202,7 +2202,6 @@
apic_printk(APIC_DEBUG, KERN_CONT "\n");
trace_error_apic_exit(ERROR_APIC_VECTOR);
- exiting_irq();
}
/**
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 159bd0c..5cbaca5 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -115,7 +115,8 @@
* denote it as spurious which is no harm as this is a rare event
* and interrupt handlers have to cope with spurious interrupts
* anyway. If the vector is unused, then it is marked so it won't
- * trigger the 'No irq handler for vector' warning in do_IRQ().
+ * trigger the 'No irq handler for vector' warning in
+ * common_interrupt().
*
* This requires to hold vector lock to prevent concurrent updates to
* the affected vector.
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 67768e5443..c48be6e 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -861,13 +861,13 @@
apicd->move_in_progress = 0;
}
-asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void)
+DEFINE_IDTENTRY_SYSVEC(sysvec_irq_move_cleanup)
{
struct hlist_head *clhead = this_cpu_ptr(&cleanup_list);
struct apic_chip_data *apicd;
struct hlist_node *tmp;
- entering_ack_irq();
+ ack_APIC_irq();
/* Prevent vectors vanishing under us */
raw_spin_lock(&vector_lock);
@@ -892,7 +892,6 @@
}
raw_spin_unlock(&vector_lock);
- exiting_irq();
}
static void __send_cleanup_vector(struct apic_chip_data *apicd)
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index c2a47016..828be79 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -57,9 +57,6 @@
BLANK();
#undef ENTRY
- OFFSET(TSS_ist, tss_struct, x86_tss.ist);
- DEFINE(DB_STACK_OFFSET, offsetof(struct cea_exception_stacks, DB_stack) -
- offsetof(struct cea_exception_stacks, DB1_stack));
BLANK();
#ifdef CONFIG_STACKPROTECTOR
diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c
index 676022e..1da9b1c 100644
--- a/arch/x86/kernel/cpu/acrn.c
+++ b/arch/x86/kernel/cpu/acrn.c
@@ -10,10 +10,10 @@
*/
#include <linux/interrupt.h>
-#include <asm/acrn.h>
#include <asm/apic.h>
#include <asm/desc.h>
#include <asm/hypervisor.h>
+#include <asm/idtentry.h>
#include <asm/irq_regs.h>
static uint32_t __init acrn_detect(void)
@@ -24,7 +24,7 @@
static void __init acrn_init_platform(void)
{
/* Setup the IDT for ACRN hypervisor callback */
- alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, acrn_hv_callback_vector);
+ alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_acrn_hv_callback);
}
static bool acrn_x2apic_available(void)
@@ -39,7 +39,7 @@
static void (*acrn_intr_handler)(void);
-__visible void __irq_entry acrn_hv_vector_handler(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_acrn_hv_callback)
{
struct pt_regs *old_regs = set_irq_regs(regs);
@@ -50,13 +50,12 @@
* will block the interrupt whose vector is lower than
* HYPERVISOR_CALLBACK_VECTOR.
*/
- entering_ack_irq();
+ ack_APIC_irq();
inc_irq_stat(irq_hv_callback_count);
if (acrn_intr_handler)
acrn_intr_handler();
- exiting_irq();
set_irq_regs(old_regs);
}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 8be042d..043d93c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1706,25 +1706,6 @@
X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
}
-DEFINE_PER_CPU(int, debug_stack_usage);
-DEFINE_PER_CPU(u32, debug_idt_ctr);
-
-void debug_stack_set_zero(void)
-{
- this_cpu_inc(debug_idt_ctr);
- load_current_idt();
-}
-NOKPROBE_SYMBOL(debug_stack_set_zero);
-
-void debug_stack_reset(void)
-{
- if (WARN_ON(!this_cpu_read(debug_idt_ctr)))
- return;
- if (this_cpu_dec_return(debug_idt_ctr) == 0)
- load_current_idt();
-}
-NOKPROBE_SYMBOL(debug_stack_reset);
-
#else /* CONFIG_X86_64 */
DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 52de616..99be063 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -192,7 +192,12 @@
static char buf_mcatype[MAX_MCATYPE_NAME_LEN];
static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
-static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */
+
+/*
+ * A bitmap of the banks enabled on each logical CPU. It controls which
+ * thresholding descriptors to initialize later in mce_threshold_create_device().
+ */
+static DEFINE_PER_CPU(unsigned int, bank_map);
/* Map of banks that have more than MCA_MISC0 available. */
static DEFINE_PER_CPU(u32, smca_misc_banks_map);
@@ -381,6 +386,10 @@
struct thresh_restart *tr = _tr;
u32 hi, lo;
+ /* sysfs write might race against an offline operation */
+ if (this_cpu_read(threshold_banks))
+ return;
+
rdmsr(tr->b->address, lo, hi);
if (tr->b->threshold_limit < (hi & THRESHOLD_MAX))
@@ -568,14 +577,19 @@
{
enum smca_bank_types bank_type = smca_get_bank_type(m->bank);
struct cpuinfo_x86 *c = &boot_cpu_data;
- u8 xec = (m->status >> 16) & 0x3F;
/* See Family 17h Models 10h-2Fh Erratum #1114. */
if (c->x86 == 0x17 &&
c->x86_model >= 0x10 && c->x86_model <= 0x2F &&
- bank_type == SMCA_IF && xec == 10)
+ bank_type == SMCA_IF && XEC(m->status, 0x3f) == 10)
return true;
+ /* NB GART TLB error reporting is disabled by default. */
+ if (c->x86 < 0x17) {
+ if (m->bank == 4 && XEC(m->status, 0x1f) == 0x5)
+ return true;
+ }
+
return false;
}
@@ -907,14 +921,13 @@
mce_log(&m);
}
-asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error)
{
- entering_irq();
trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR);
inc_irq_stat(irq_deferred_error_count);
deferred_error_int_vector();
trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR);
- exiting_ack_irq();
+ ack_APIC_irq();
}
/*
@@ -1016,13 +1029,22 @@
static void amd_threshold_interrupt(void)
{
struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL;
+ struct threshold_bank **bp = this_cpu_read(threshold_banks);
unsigned int bank, cpu = smp_processor_id();
+ /*
+ * Validate that the threshold bank has been initialized already. The
+ * handler is installed at boot time, but on a hotplug event the
+ * interrupt might fire before the data has been initialized.
+ */
+ if (!bp)
+ return;
+
for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
- first_block = per_cpu(threshold_banks, cpu)[bank]->blocks;
+ first_block = bp[bank]->blocks;
if (!first_block)
continue;
@@ -1071,7 +1093,8 @@
memset(&tr, 0, sizeof(tr));
tr.b = b;
- smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
+ if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1))
+ return -ENODEV;
return size;
}
@@ -1095,7 +1118,8 @@
b->threshold_limit = new;
tr.b = b;
- smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
+ if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1))
+ return -ENODEV;
return size;
}
@@ -1104,7 +1128,9 @@
{
u32 lo, hi;
- rdmsr_on_cpu(b->cpu, b->address, &lo, &hi);
+ /* CPU might be offline by now */
+ if (rdmsr_on_cpu(b->cpu, b->address, &lo, &hi))
+ return -ENODEV;
return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) -
(THRESHOLD_MAX - b->threshold_limit)));
@@ -1209,10 +1235,10 @@
u32 low, high;
int err;
- if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS))
+ if ((bank >= this_cpu_read(mce_num_banks)) || (block >= NR_BLOCKS))
return 0;
- if (rdmsr_safe_on_cpu(cpu, address, &low, &high))
+ if (rdmsr_safe(address, &low, &high))
return 0;
if (!(high & MASK_VALID_HI)) {
@@ -1247,6 +1273,7 @@
INIT_LIST_HEAD(&b->miscj);
+ /* This is safe as @tb is not visible yet */
if (tb->blocks)
list_add(&b->miscj, &tb->blocks->miscj);
else
@@ -1267,13 +1294,12 @@
if (b)
kobject_uevent(&b->kobj, KOBJ_ADD);
- return err;
+ return 0;
out_free:
if (b) {
- kobject_put(&b->kobj);
list_del(&b->miscj);
- kfree(b);
+ kobject_put(&b->kobj);
}
return err;
}
@@ -1302,9 +1328,10 @@
return err;
}
-static int threshold_create_bank(unsigned int cpu, unsigned int bank)
+static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
+ unsigned int bank)
{
- struct device *dev = per_cpu(mce_device, cpu);
+ struct device *dev = this_cpu_read(mce_device);
struct amd_northbridge *nb = NULL;
struct threshold_bank *b = NULL;
const char *name = get_name(bank, NULL);
@@ -1324,7 +1351,7 @@
if (err)
goto out;
- per_cpu(threshold_banks, cpu)[bank] = b;
+ bp[bank] = b;
refcount_inc(&b->cpus);
err = __threshold_add_blocks(b);
@@ -1339,6 +1366,7 @@
goto out;
}
+ /* Associate the bank with the per-CPU MCE device */
b->kobj = kobject_create_and_add(name, &dev->kobj);
if (!b->kobj) {
err = -EINVAL;
@@ -1346,6 +1374,7 @@
}
if (is_shared_bank(bank)) {
+ b->shared = 1;
refcount_set(&b->cpus, 1);
/* nb is already initialized, see above */
@@ -1357,16 +1386,16 @@
err = allocate_threshold_blocks(cpu, b, bank, 0, msr_ops.misc(bank));
if (err)
- goto out_free;
+ goto out_kobj;
- per_cpu(threshold_banks, cpu)[bank] = b;
-
+ bp[bank] = b;
return 0;
- out_free:
+out_kobj:
+ kobject_put(b->kobj);
+out_free:
kfree(b);
-
- out:
+out:
return err;
}
@@ -1375,21 +1404,16 @@
kfree(to_block(kobj));
}
-static void deallocate_threshold_block(unsigned int cpu, unsigned int bank)
+static void deallocate_threshold_blocks(struct threshold_bank *bank)
{
- struct threshold_block *pos = NULL;
- struct threshold_block *tmp = NULL;
- struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank];
+ struct threshold_block *pos, *tmp;
- if (!head)
- return;
-
- list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) {
+ list_for_each_entry_safe(pos, tmp, &bank->blocks->miscj, miscj) {
list_del(&pos->miscj);
kobject_put(&pos->kobj);
}
- kobject_put(&head->blocks->kobj);
+ kobject_put(&bank->blocks->kobj);
}
static void __threshold_remove_blocks(struct threshold_bank *b)
@@ -1403,122 +1427,102 @@
kobject_del(&pos->kobj);
}
-static void threshold_remove_bank(unsigned int cpu, int bank)
+static void threshold_remove_bank(struct threshold_bank *bank)
{
struct amd_northbridge *nb;
- struct threshold_bank *b;
- b = per_cpu(threshold_banks, cpu)[bank];
- if (!b)
+ if (!bank->blocks)
+ goto out_free;
+
+ if (!bank->shared)
+ goto out_dealloc;
+
+ if (!refcount_dec_and_test(&bank->cpus)) {
+ __threshold_remove_blocks(bank);
return;
-
- if (!b->blocks)
- goto free_out;
-
- if (is_shared_bank(bank)) {
- if (!refcount_dec_and_test(&b->cpus)) {
- __threshold_remove_blocks(b);
- per_cpu(threshold_banks, cpu)[bank] = NULL;
- return;
- } else {
- /*
- * the last CPU on this node using the shared bank is
- * going away, remove that bank now.
- */
- nb = node_to_amd_nb(amd_get_nb_id(cpu));
- nb->bank4 = NULL;
- }
+ } else {
+ /*
+ * The last CPU on this node using the shared bank is going
+ * away, remove that bank now.
+ */
+ nb = node_to_amd_nb(amd_get_nb_id(smp_processor_id()));
+ nb->bank4 = NULL;
}
- deallocate_threshold_block(cpu, bank);
+out_dealloc:
+ deallocate_threshold_blocks(bank);
-free_out:
- kobject_del(b->kobj);
- kobject_put(b->kobj);
- kfree(b);
- per_cpu(threshold_banks, cpu)[bank] = NULL;
+out_free:
+ kobject_put(bank->kobj);
+ kfree(bank);
}
int mce_threshold_remove_device(unsigned int cpu)
{
- unsigned int bank;
+ struct threshold_bank **bp = this_cpu_read(threshold_banks);
+ unsigned int bank, numbanks = this_cpu_read(mce_num_banks);
- for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) {
- if (!(per_cpu(bank_map, cpu) & (1 << bank)))
- continue;
- threshold_remove_bank(cpu, bank);
+ if (!bp)
+ return 0;
+
+ /*
+ * Clear the pointer before cleaning up so that the interrupt handler
+ * can no longer reach any of this data.
+ */
+ this_cpu_write(threshold_banks, NULL);
+
+ for (bank = 0; bank < numbanks; bank++) {
+ if (bp[bank]) {
+ threshold_remove_bank(bp[bank]);
+ bp[bank] = NULL;
+ }
}
- kfree(per_cpu(threshold_banks, cpu));
- per_cpu(threshold_banks, cpu) = NULL;
+ kfree(bp);
return 0;
}
-/* create dir/files for all valid threshold banks */
+/**
+ * mce_threshold_create_device - Create the per-CPU MCE threshold device
+ * @cpu: The plugged in CPU
+ *
+ * Create directories and files for all valid threshold banks.
+ *
+ * This is invoked from the CPU hotplug callback which was installed in
+ * mcheck_init_device(). The invocation happens in context of the hotplug
+ * thread running on @cpu. The callback is invoked on all CPUs which are
+ * online when the callback is installed or during a real hotplug event.
+ */
int mce_threshold_create_device(unsigned int cpu)
{
- unsigned int bank;
+ unsigned int numbanks, bank;
struct threshold_bank **bp;
- int err = 0;
+ int err;
- bp = per_cpu(threshold_banks, cpu);
+ if (!mce_flags.amd_threshold)
+ return 0;
+
+ bp = this_cpu_read(threshold_banks);
if (bp)
return 0;
- bp = kcalloc(per_cpu(mce_num_banks, cpu), sizeof(struct threshold_bank *),
- GFP_KERNEL);
+ numbanks = this_cpu_read(mce_num_banks);
+ bp = kcalloc(numbanks, sizeof(*bp), GFP_KERNEL);
if (!bp)
return -ENOMEM;
- per_cpu(threshold_banks, cpu) = bp;
-
- for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) {
- if (!(per_cpu(bank_map, cpu) & (1 << bank)))
+ for (bank = 0; bank < numbanks; ++bank) {
+ if (!(this_cpu_read(bank_map) & (1 << bank)))
continue;
- err = threshold_create_bank(cpu, bank);
+ err = threshold_create_bank(bp, cpu, bank);
if (err)
- goto err;
+ goto out_err;
}
- return err;
-err:
- mce_threshold_remove_device(cpu);
- return err;
-}
-
-static __init int threshold_init_device(void)
-{
- unsigned lcpu = 0;
-
- /* to hit CPUs online before the notifier is up */
- for_each_online_cpu(lcpu) {
- int err = mce_threshold_create_device(lcpu);
-
- if (err)
- return err;
- }
+ this_cpu_write(threshold_banks, bp);
if (thresholding_irq_en)
mce_threshold_vector = amd_threshold_interrupt;
-
return 0;
+out_err:
+ mce_threshold_remove_device(cpu);
+ return err;
}
-/*
- * there are 3 funcs which need to be _initcalled in a logic sequence:
- * 1. xen_late_init_mcelog
- * 2. mcheck_init_device
- * 3. threshold_init_device
- *
- * xen_late_init_mcelog must register xen_mce_chrdev_device before
- * native mce_chrdev_device registration if running under xen platform;
- *
- * mcheck_init_device should be inited before threshold_init_device to
- * initialize mce_device, otherwise a NULL ptr dereference will cause panic.
- *
- * so we use following _initcalls
- * 1. device_initcall(xen_late_init_mcelog);
- * 2. device_initcall_sync(mcheck_init_device);
- * 3. late_initcall(threshold_init_device);
- *
- * when running under xen, the initcall order is 1,2,3;
- * on baremetal, we skip 1 and we do only 2 and 3.
- */
-late_initcall(threshold_init_device);
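
The teardown ordering in mce_threshold_remove_device() above is the heart of
the hotplug fix: the per-CPU threshold_banks pointer is set to NULL before any
bank is freed, and amd_threshold_interrupt() now bails out when it reads NULL.
A minimal userspace sketch of that discipline, with stand-in types rather than
kernel code (the kernel also relies on the interrupt running on the same CPU
as the teardown, which this single-threaded demo elides)::

    #include <stdio.h>
    #include <stdlib.h>

    struct bank { int limit; };

    /* stands in for the per-CPU threshold_banks pointer */
    static struct bank **banks;

    static void threshold_interrupt(void)
    {
        /* mirrors the !bp check added to amd_threshold_interrupt() */
        if (!banks) {
            puts("interrupt: banks not initialized, bailing out");
            return;
        }
        printf("interrupt: limit=%d\n", banks[0]->limit);
    }

    static void remove_device(void)
    {
        struct bank **bp = banks;

        /* clear the pointer first, then tear down, as in the patch */
        banks = NULL;
        free(bp[0]);
        free(bp);
    }

    int main(void)
    {
        banks = calloc(1, sizeof(*banks));
        banks[0] = malloc(sizeof(**banks));
        banks[0]->limit = 5;

        threshold_interrupt();  /* sees valid data */
        remove_device();
        threshold_interrupt();  /* sees NULL and bails */
        return 0;
    }
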
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index e9265e2..ce9120c 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -130,7 +130,7 @@
BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
/* Do initial initialization of a struct mce */
-void mce_setup(struct mce *m)
+noinstr void mce_setup(struct mce *m)
{
memset(m, 0, sizeof(struct mce));
m->cpu = m->extcpu = smp_processor_id();
@@ -140,12 +140,12 @@
m->cpuid = cpuid_eax(1);
m->socketid = cpu_data(m->extcpu).phys_proc_id;
m->apicid = cpu_data(m->extcpu).initial_apicid;
- rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
+ m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP);
if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
- rdmsrl(MSR_PPIN, m->ppin);
+ m->ppin = __rdmsr(MSR_PPIN);
else if (this_cpu_has(X86_FEATURE_AMD_PPIN))
- rdmsrl(MSR_AMD_PPIN, m->ppin);
+ m->ppin = __rdmsr(MSR_AMD_PPIN);
m->microcode = boot_cpu_data.microcode;
}
@@ -160,29 +160,17 @@
}
EXPORT_SYMBOL_GPL(mce_log);
-/*
- * We run the default notifier if we have only the UC, the first and the
- * default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS
- * notifiers registered on the chain.
- */
-#define NUM_DEFAULT_NOTIFIERS 3
-static atomic_t num_notifiers;
-
void mce_register_decode_chain(struct notifier_block *nb)
{
if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC))
return;
- atomic_inc(&num_notifiers);
-
blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_register_decode_chain);
void mce_unregister_decode_chain(struct notifier_block *nb)
{
- atomic_dec(&num_notifiers);
-
blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
@@ -265,6 +253,7 @@
}
pr_cont("\n");
+
/*
* Note this output is parsed by external tools and old fields
* should not be changed.
@@ -531,6 +520,14 @@
}
EXPORT_SYMBOL_GPL(mce_is_memory_error);
+static bool whole_page(struct mce *m)
+{
+ if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV))
+ return true;
+
+ return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT;
+}
+
bool mce_is_correctable(struct mce *m)
{
if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
@@ -546,22 +543,7 @@
}
EXPORT_SYMBOL_GPL(mce_is_correctable);
-static bool cec_add_mce(struct mce *m)
-{
- if (!m)
- return false;
-
- /* We eat only correctable DRAM errors with usable addresses. */
- if (mce_is_memory_error(m) &&
- mce_is_correctable(m) &&
- mce_usable_address(m))
- if (!cec_add_elem(m->addr >> PAGE_SHIFT))
- return true;
-
- return false;
-}
-
-static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
+static int mce_early_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
struct mce *m = (struct mce *)data;
@@ -569,9 +551,6 @@
if (!m)
return NOTIFY_DONE;
- if (cec_add_mce(m))
- return NOTIFY_STOP;
-
/* Emit the trace record: */
trace_mce_record(m);
@@ -582,9 +561,9 @@
return NOTIFY_DONE;
}
-static struct notifier_block first_nb = {
- .notifier_call = mce_first_notifier,
- .priority = MCE_PRIO_FIRST,
+static struct notifier_block early_nb = {
+ .notifier_call = mce_early_notifier,
+ .priority = MCE_PRIO_EARLY,
};
static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
@@ -601,8 +580,10 @@
return NOTIFY_DONE;
pfn = mce->addr >> PAGE_SHIFT;
- if (!memory_failure(pfn, 0))
- set_mce_nospec(pfn);
+ if (!memory_failure(pfn, 0)) {
+ set_mce_nospec(pfn, whole_page(mce));
+ mce->kflags |= MCE_HANDLED_UC;
+ }
return NOTIFY_OK;
}
@@ -620,10 +601,8 @@
if (!m)
return NOTIFY_DONE;
- if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS)
- return NOTIFY_DONE;
-
- __print_mce(m);
+ if (mca_cfg.print_all || !m->kflags)
+ __print_mce(m);
return NOTIFY_DONE;
}
@@ -1100,13 +1079,15 @@
* kdump kernel establishing a new #MC handler where a broadcasted MCE
* might not get handled properly.
*/
-static bool __mc_check_crashing_cpu(int cpu)
+static noinstr bool mce_check_crashing_cpu(void)
{
+ unsigned int cpu = smp_processor_id();
+
if (cpu_is_offline(cpu) ||
(crashing_cpu != -1 && crashing_cpu != cpu)) {
u64 mcgstatus;
- mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ mcgstatus = __rdmsr(MSR_IA32_MCG_STATUS);
if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
if (mcgstatus & MCG_STATUS_LMCES)
@@ -1114,7 +1095,7 @@
}
if (mcgstatus & MCG_STATUS_RIPV) {
- mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+ __wrmsr(MSR_IA32_MCG_STATUS, 0, 0);
return true;
}
}
@@ -1200,11 +1181,12 @@
int flags = MF_ACTION_REQUIRED;
pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
- if (!(p->mce_status & MCG_STATUS_RIPV))
+
+ if (!p->mce_ripv)
flags |= MF_MUST_KILL;
if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags)) {
- set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
+ set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
return;
}
@@ -1230,12 +1212,11 @@
* backing the user stack, tracing that reads the user stack will cause
* potentially infinite recursion.
*/
-void noinstr do_machine_check(struct pt_regs *regs, long error_code)
+void noinstr do_machine_check(struct pt_regs *regs)
{
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
struct mca_config *cfg = &mca_cfg;
- int cpu = smp_processor_id();
struct mce m, *final;
char *msg = NULL;
int worst = 0;
@@ -1264,11 +1245,6 @@
*/
int lmce = 1;
- if (__mc_check_crashing_cpu(cpu))
- return;
-
- nmi_enter();
-
this_cpu_inc(mce_exception_count);
mce_gather_info(&m, regs);
@@ -1356,7 +1332,7 @@
sync_core();
if (worst != MCE_AR_SEVERITY && !kill_it)
- goto out_ist;
+ return;
/* Fault was in user mode and we need to take some action */
if ((m.cs & 3) == 3) {
@@ -1364,18 +1340,27 @@
BUG_ON(!on_thread_stack() || !user_mode(regs));
current->mce_addr = m.addr;
- current->mce_status = m.mcgstatus;
+ current->mce_ripv = !!(m.mcgstatus & MCG_STATUS_RIPV);
+ current->mce_whole_page = whole_page(&m);
current->mce_kill_me.func = kill_me_maybe;
if (kill_it)
current->mce_kill_me.func = kill_me_now;
task_work_add(current, &current->mce_kill_me, true);
} else {
- if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0))
- mce_panic("Failed kernel mode recovery", &m, msg);
+ /*
+ * Handle an MCE which has happened in kernel space but from
+ * which the kernel can recover: ex_has_fault_handler() has
+ * already verified that the rIP at which the error happened is
+ * a rIP from which the kernel can recover (by jumping to
+ * recovery code specified in _ASM_EXTABLE_FAULT()) and the
+ * corresponding exception handler which would do that is the
+ * proper one.
+ */
+ if (m.kflags & MCE_IN_KERNEL_RECOV) {
+ if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
+ mce_panic("Failed kernel mode recovery", &m, msg);
+ }
}
-
-out_ist:
- nmi_exit();
}
EXPORT_SYMBOL_GPL(do_machine_check);
@@ -1765,6 +1750,7 @@
mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR);
mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA);
+ mce_flags.amd_threshold = 1;
if (mce_flags.smca) {
msr_ops.ctl = smca_ctl_reg;
@@ -1902,21 +1888,84 @@
}
/* Handle unconfigured int18 (should never happen) */
-static void unexpected_machine_check(struct pt_regs *regs, long error_code)
+static noinstr void unexpected_machine_check(struct pt_regs *regs)
{
+ instrumentation_begin();
pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
smp_processor_id());
+ instrumentation_end();
}
/* Call the installed machine check handler for this CPU setup. */
-void (*machine_check_vector)(struct pt_regs *, long error_code) =
- unexpected_machine_check;
+void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check;
-dotraplinkage notrace void do_mce(struct pt_regs *regs, long error_code)
+static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
{
- machine_check_vector(regs, error_code);
+ /*
+ * Only required when from kernel mode. See
+ * mce_check_crashing_cpu() for details.
+ */
+ if (machine_check_vector == do_machine_check &&
+ mce_check_crashing_cpu())
+ return;
+
+ nmi_enter();
+ /*
+ * The call targets are marked noinstr, but objtool can't figure
+ * that out because it's an indirect call. Annotate it.
+ */
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+ machine_check_vector(regs);
+ if (regs->flags & X86_EFLAGS_IF)
+ trace_hardirqs_on_prepare();
+ instrumentation_end();
+ nmi_exit();
}
-NOKPROBE_SYMBOL(do_mce);
+
+static __always_inline void exc_machine_check_user(struct pt_regs *regs)
+{
+ idtentry_enter_user(regs);
+ instrumentation_begin();
+ machine_check_vector(regs);
+ instrumentation_end();
+ idtentry_exit_user(regs);
+}
+
+#ifdef CONFIG_X86_64
+/* MCE hit kernel mode */
+DEFINE_IDTENTRY_MCE(exc_machine_check)
+{
+ unsigned long dr7;
+
+ dr7 = local_db_save();
+ exc_machine_check_kernel(regs);
+ local_db_restore(dr7);
+}
+
+/* The user mode variant. */
+DEFINE_IDTENTRY_MCE_USER(exc_machine_check)
+{
+ unsigned long dr7;
+
+ dr7 = local_db_save();
+ exc_machine_check_user(regs);
+ local_db_restore(dr7);
+}
+#else
+/* 32bit unified entry point */
+DEFINE_IDTENTRY_MCE(exc_machine_check)
+{
+ unsigned long dr7;
+
+ dr7 = local_db_save();
+ if (user_mode(regs))
+ exc_machine_check_user(regs);
+ else
+ exc_machine_check_kernel(regs);
+ local_db_restore(dr7);
+}
+#endif
/*
* Called for each booted CPU to set up machine checks.
@@ -1999,6 +2048,7 @@
* mce=no_cmci Disables CMCI
* mce=no_lmce Disables LMCE
* mce=dont_log_ce Clears corrected events silently, no log created for CEs.
+ * mce=print_all Print all machine check logs to console
* mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
* mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
* monarchtimeout is how long to wait for other CPUs on machine
@@ -2027,6 +2077,8 @@
cfg->lmce_disabled = 1;
else if (!strcmp(str, "dont_log_ce"))
cfg->dont_log_ce = true;
+ else if (!strcmp(str, "print_all"))
+ cfg->print_all = true;
else if (!strcmp(str, "ignore_ce"))
cfg->ignore_ce = true;
else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
@@ -2049,7 +2101,7 @@
int __init mcheck_init(void)
{
mcheck_intel_therm_init();
- mce_register_decode_chain(&first_nb);
+ mce_register_decode_chain(&early_nb);
mce_register_decode_chain(&mce_uc_nb);
mce_register_decode_chain(&mce_default_nb);
mcheck_vendor_init_severity();
@@ -2293,6 +2345,7 @@
static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
+static DEVICE_BOOL_ATTR(print_all, 0644, mca_cfg.print_all);
static struct dev_ext_attribute dev_attr_check_interval = {
__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
@@ -2317,6 +2370,7 @@
#endif
&dev_attr_monarch_timeout.attr,
&dev_attr_dont_log_ce.attr,
+ &dev_attr_print_all.attr,
&dev_attr_ignore_ce.attr,
&dev_attr_cmci_disabled.attr,
NULL
@@ -2489,6 +2543,13 @@
}
}
+/*
+ * When running on XEN, this initcall is ordered against the XEN mcelog
+ * initcall:
+ *
+ * device_initcall(xen_late_init_mcelog);
+ * device_initcall_sync(mcheck_init_device);
+ */
static __init int mcheck_init_device(void)
{
int err;
@@ -2520,6 +2581,10 @@
if (err)
goto err_out_mem;
+ /*
+ * Invokes mce_cpu_online() on all CPUs which are online when
+ * the state is installed.
+ */
err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
mce_cpu_online, mce_cpu_pre_down);
if (err < 0)
@@ -2609,7 +2674,6 @@
static_branch_inc(&mcsafe_key);
mcheck_debugfs_init();
- cec_init();
/*
* Flush out everything that has been logged during early boot, now that
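
The new whole_page() helper above derives the recovery granularity from the
MCi_MISC address LSB field: when the least significant valid address bit is at
or above PAGE_SHIFT, the poison covers at least a full page. A self-contained
sketch of that arithmetic, assuming the usual bits 5:0 placement of the LSB
field (the kernel also treats records without a valid MISC register as
whole-page, which this demo skips)::

    #include <stdio.h>

    #define DEMO_PAGE_SHIFT   12                            /* 4K pages */
    #define MISC_ADDR_LSB(m)  ((unsigned int)((m) & 0x3f))  /* bits 5:0 */

    static int whole_page(unsigned long long misc)
    {
        return MISC_ADDR_LSB(misc) >= DEMO_PAGE_SHIFT;
    }

    int main(void)
    {
        /* LSB 6: address valid to cacheline granularity, sub-page error */
        printf("lsb=6  -> whole_page=%d\n", whole_page(6));
        /* LSB 12: address valid only to page granularity, whole page gone */
        printf("lsb=12 -> whole_page=%d\n", whole_page(12));
        return 0;
    }
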
diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c
index d089567..43c4660 100644
--- a/arch/x86/kernel/cpu/mce/dev-mcelog.c
+++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c
@@ -39,6 +39,9 @@
struct mce *mce = (struct mce *)data;
unsigned int entry;
+ if (mce->kflags & MCE_HANDLED_CEC)
+ return NOTIFY_DONE;
+
mutex_lock(&mce_chrdev_read_mutex);
entry = mcelog->next;
@@ -56,6 +59,7 @@
memcpy(mcelog->entry + entry, mce, sizeof(struct mce));
mcelog->entry[entry].finished = 1;
+ mcelog->entry[entry].kflags = 0;
/* wake processes polling /dev/mcelog */
wake_up_interruptible(&mce_chrdev_wait);
@@ -63,6 +67,7 @@
unlock:
mutex_unlock(&mce_chrdev_read_mutex);
+ mce->kflags |= MCE_HANDLED_MCELOG;
return NOTIFY_OK;
}
@@ -324,6 +329,7 @@
.write = mce_chrdev_write,
.poll = mce_chrdev_poll,
.unlocked_ioctl = mce_chrdev_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
.llseek = no_llseek,
};
@@ -343,7 +349,7 @@
if (!mcelog)
return -ENOMEM;
- strncpy(mcelog->signature, MCE_LOG_SIGNATURE, sizeof(mcelog->signature));
+ memcpy(mcelog->signature, MCE_LOG_SIGNATURE, sizeof(mcelog->signature));
mcelog->len = mce_log_len;
mcelog->recordlen = sizeof(struct mce);
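
The strncpy() to memcpy() change above is as much about semantics as about
warnings: the signature field is a fixed-size byte array that is only
compared, never printed as a C string, so NUL termination buys nothing, and
filling a buffer exactly with strncpy() trips -Wstringop-truncation on recent
GCC. A tiny demo, assuming the 12-byte field width that matches the
"MACHINECHECK" signature::

    #include <stdio.h>
    #include <string.h>

    struct log_hdr { char signature[12]; };  /* exactly fits, no NUL */

    int main(void)
    {
        struct log_hdr h;

        /* copy the full fixed width; no terminator is expected or kept */
        memcpy(h.signature, "MACHINECHECK", sizeof(h.signature));
        printf("match=%d\n",
               !memcmp(h.signature, "MACHINECHECK", sizeof(h.signature)));
        return 0;
    }
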
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index 3413b41..0593b19 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -146,9 +146,9 @@
regs.cs = m->cs;
pregs = &regs;
}
- /* in mcheck exeception handler, irq will be disabled */
+ /* do_machine_check() expects interrupts to be disabled, at a minimum */
local_irq_save(flags);
- do_machine_check(pregs, 0);
+ do_machine_check(pregs);
local_irq_restore(flags);
m->finished = 0;
}
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 3b00817..6473070 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -9,7 +9,7 @@
#include <asm/mce.h>
/* Pointer to the installed machine check handler for this CPU setup. */
-extern void (*machine_check_vector)(struct pt_regs *, long error_code);
+extern void (*machine_check_vector)(struct pt_regs *);
enum severity_level {
MCE_NO_SEVERITY,
@@ -119,6 +119,7 @@
bool dont_log_ce;
bool cmci_disabled;
bool ignore_ce;
+ bool print_all;
__u64 lmce_disabled : 1,
disabled : 1,
@@ -148,7 +149,7 @@
* Recovery. It indicates support for data poisoning in HW and deferred
* error interrupts.
*/
- succor : 1,
+ succor : 1,
/*
* (AMD) SMCA: This bit indicates support for Scalable MCA which expands
@@ -156,9 +157,12 @@
* banks. Also, to accommodate the new banks and registers, the MCA
* register space is moved to a new MSR range.
*/
- smca : 1,
+ smca : 1,
- __reserved_0 : 61;
+ /* AMD-style error thresholding banks present. */
+ amd_threshold : 1,
+
+ __reserved_0 : 60;
};
extern struct mce_vendor_flags mce_flags;
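
Note the bookkeeping in the mce_vendor_flags hunk above: adding the one-bit
amd_threshold flag requires shrinking __reserved_0 from 61 to 60 so the
bitfield still totals exactly 64 bits. A compile-time check of that invariant,
using illustrative field names and assuming GCC/Clang bitfield layout::

    #include <assert.h>
    #include <stdint.h>

    struct vendor_flags {
        uint64_t overflow_recov : 1,
                 succor         : 1,
                 smca           : 1,
                 amd_threshold  : 1,   /* the new bit */
                 __reserved_0   : 60;  /* was 61 before the new bit */
    };

    int main(void)
    {
        /* 1 + 1 + 1 + 1 + 60 = 64 bits, so the struct stays 8 bytes */
        assert(sizeof(struct vendor_flags) == 8);
        return 0;
    }
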
diff --git a/arch/x86/kernel/cpu/mce/p5.c b/arch/x86/kernel/cpu/mce/p5.c
index 5ee94aa1..19e90ca 100644
--- a/arch/x86/kernel/cpu/mce/p5.c
+++ b/arch/x86/kernel/cpu/mce/p5.c
@@ -21,12 +21,11 @@
int mce_p5_enabled __read_mostly;
/* Machine check handler for Pentium class Intel CPUs: */
-static void pentium_machine_check(struct pt_regs *regs, long error_code)
+static noinstr void pentium_machine_check(struct pt_regs *regs)
{
u32 loaddr, hi, lotype;
- nmi_enter();
-
+ instrumentation_begin();
rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
@@ -39,8 +38,7 @@
}
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
-
- nmi_exit();
+ instrumentation_end();
}
/* Set up machine check reporting for processors with Intel style MCE: */
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index 87bcdc6..e1da619 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -213,8 +213,12 @@
{
if ((m->cs & 3) == 3)
return IN_USER;
- if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip))
+
+ if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip)) {
+ m->kflags |= MCE_IN_KERNEL_RECOV;
return IN_KERNEL_RECOV;
+ }
+
return IN_KERNEL;
}
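
Setting MCE_IN_KERNEL_RECOV in m->kflags above is one instance of the scheme
threaded through this series: every consumer of a record marks it handled with
its own kflags bit, and the default notifier in core.c now prints only records
nobody claimed, unless print_all overrides. A userspace sketch of that
protocol, with illustrative bit names and values::

    #include <stdio.h>

    #define HANDLED_UC      (1u << 0)  /* illustrative bit values */
    #define HANDLED_MCELOG  (1u << 1)
    #define HANDLED_CEC     (1u << 2)

    struct record { unsigned int kflags; };

    static int print_all;  /* stands in for mca_cfg.print_all */

    static void default_notifier(struct record *r)
    {
        if (print_all || !r->kflags)
            puts("printing unclaimed record");
        else
            puts("record already claimed, staying silent");
    }

    int main(void)
    {
        struct record a = { 0 }, b = { 0 };

        b.kflags |= HANDLED_MCELOG;  /* e.g. dev-mcelog consumed it */

        default_notifier(&a);        /* unclaimed -> printed */
        default_notifier(&b);        /* claimed   -> silent  */
        return 0;
    }
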
diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c
index f36dc07..a7cd2d2 100644
--- a/arch/x86/kernel/cpu/mce/therm_throt.c
+++ b/arch/x86/kernel/cpu/mce/therm_throt.c
@@ -614,14 +614,13 @@
static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
-asmlinkage __visible void __irq_entry smp_thermal_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_thermal)
{
- entering_irq();
trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
inc_irq_stat(irq_thermal_count);
smp_thermal_vector();
trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
- exiting_ack_irq();
+ ack_APIC_irq();
}
/* Thermal monitoring depends on APIC, ACPI and clock modulation */
diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c
index 28812cc..6a059a0 100644
--- a/arch/x86/kernel/cpu/mce/threshold.c
+++ b/arch/x86/kernel/cpu/mce/threshold.c
@@ -21,12 +21,11 @@
void (*mce_threshold_vector)(void) = default_threshold_interrupt;
-asmlinkage __visible void __irq_entry smp_threshold_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_threshold)
{
- entering_irq();
trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
inc_irq_stat(irq_threshold_count);
mce_threshold_vector();
trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
- exiting_ack_irq();
+ ack_APIC_irq();
}
diff --git a/arch/x86/kernel/cpu/mce/winchip.c b/arch/x86/kernel/cpu/mce/winchip.c
index b3938c1..9c9f0ab 100644
--- a/arch/x86/kernel/cpu/mce/winchip.c
+++ b/arch/x86/kernel/cpu/mce/winchip.c
@@ -17,14 +17,12 @@
#include "internal.h"
/* Machine check handler for WinChip C6: */
-static void winchip_machine_check(struct pt_regs *regs, long error_code)
+static noinstr void winchip_machine_check(struct pt_regs *regs)
{
- nmi_enter();
-
+ instrumentation_begin();
pr_emerg("CPU0: Machine Check Exception.\n");
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
-
- nmi_exit();
+ instrumentation_end();
}
/* Set up machine check reporting on the Winchip C6 series */
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index ebf34c7..af94f05 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -23,6 +23,7 @@
#include <asm/hyperv-tlfs.h>
#include <asm/mshyperv.h>
#include <asm/desc.h>
+#include <asm/idtentry.h>
#include <asm/irq_regs.h>
#include <asm/i8259.h>
#include <asm/apic.h>
@@ -40,11 +41,10 @@
static void (*hv_kexec_handler)(void);
static void (*hv_crash_handler)(struct pt_regs *regs);
-__visible void __irq_entry hyperv_vector_handler(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback)
{
struct pt_regs *old_regs = set_irq_regs(regs);
- entering_irq();
inc_irq_stat(irq_hv_callback_count);
if (vmbus_handler)
vmbus_handler();
@@ -52,7 +52,6 @@
if (ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED)
ack_APIC_irq();
- exiting_irq();
set_irq_regs(old_regs);
}
@@ -73,19 +72,16 @@
* Routines to do per-architecture handling of stimer0
* interrupts when in Direct Mode
*/
-
-__visible void __irq_entry hv_stimer0_vector_handler(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0)
{
struct pt_regs *old_regs = set_irq_regs(regs);
- entering_irq();
inc_irq_stat(hyperv_stimer0_count);
if (hv_stimer0_handler)
hv_stimer0_handler();
add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0);
ack_APIC_irq();
- exiting_irq();
set_irq_regs(old_regs);
}
@@ -331,17 +327,19 @@
x86_platform.apic_post_init = hyperv_init;
hyperv_setup_mmu_ops();
/* Setup the IDT for hypervisor callback */
- alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector);
+ alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_hyperv_callback);
/* Setup the IDT for reenlightenment notifications */
- if (ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT)
+ if (ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT) {
alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR,
- hyperv_reenlightenment_vector);
+ asm_sysvec_hyperv_reenlightenment);
+ }
/* Setup the IDT for stimer0 */
- if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE)
+ if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) {
alloc_intr_gate(HYPERV_STIMER0_VECTOR,
- hv_stimer0_callback_vector);
+ asm_sysvec_hyperv_stimer0);
+ }
# ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu;
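
All of these conversions lean on the same generated glue. Reconstructed from
the helper names visible elsewhere in this diff (idtentry_enter_cond_rcu(),
run_on_irqstack_cond(), instrumentation_begin()) rather than quoted from
idtentry.h, DEFINE_IDTENTRY_SYSVEC(func) expands roughly along these lines,
which is why each handler body can drop its open-coded
entering_irq()/exiting_irq() pair::

    /* approximate macro body, not the authoritative idtentry.h text */
    static void __##func(struct pt_regs *regs);

    __visible noinstr void func(struct pt_regs *regs)
    {
        bool rcu_exit = idtentry_enter_cond_rcu(regs);

        instrumentation_begin();
        irq_enter_rcu();
        run_on_irqstack_cond(__##func, regs, regs);
        irq_exit_rcu();
        instrumentation_end();
        idtentry_exit_cond_rcu(regs, rcu_exit);
    }

    static noinstr void __##func(struct pt_regs *regs)
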
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
index 2ccc57f..759d392 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -10,7 +10,6 @@
#include <asm/desc.h>
#include <asm/traps.h>
-extern void double_fault(void);
#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
#define TSS(x) this_cpu_read(cpu_tss_rw.x86_tss.x)
@@ -21,7 +20,7 @@
* Called by double_fault with CR0.TS and EFLAGS.NT cleared. The CPU thinks
* we're running the doublefault task. Cannot return.
*/
-asmlinkage notrace void __noreturn doublefault_shim(void)
+asmlinkage noinstr void __noreturn doublefault_shim(void)
{
unsigned long cr2;
struct pt_regs regs;
@@ -40,7 +39,7 @@
* Fill in pt_regs. A downside of doing this in C is that the unwinder
* won't see it (no ENCODE_FRAME_POINTER), so a nested stack dump
* won't successfully unwind to the source of the double fault.
- * The main dump from do_double_fault() is fine, though, since it
+ * The main dump from exc_double_fault() is fine, though, since it
* uses these regs directly.
*
* If anyone ever cares, this could be moved to asm.
@@ -70,7 +69,7 @@
regs.cx = TSS(cx);
regs.bx = TSS(bx);
- do_double_fault(&regs, 0, cr2);
+ exc_double_fault(&regs, 0, cr2);
/*
* x86_32 does not save the original CR3 anywhere on a task switch.
@@ -84,7 +83,6 @@
*/
panic("cannot return from double fault\n");
}
-NOKPROBE_SYMBOL(doublefault_shim);
DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
.tss = {
@@ -95,7 +93,7 @@
.ldt = 0,
.io_bitmap_base = IO_BITMAP_OFFSET_INVALID,
- .ip = (unsigned long) double_fault,
+ .ip = (unsigned long) asm_exc_double_fault,
.flags = X86_EFLAGS_FIXED,
.es = __USER_DS,
.cs = __KERNEL_CS,
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 460ae7f6..4a94d38 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -22,15 +22,13 @@
static const char * const exception_stack_names[] = {
[ ESTACK_DF ] = "#DF",
[ ESTACK_NMI ] = "NMI",
- [ ESTACK_DB2 ] = "#DB2",
- [ ESTACK_DB1 ] = "#DB1",
[ ESTACK_DB ] = "#DB",
[ ESTACK_MCE ] = "#MC",
};
const char *stack_type_name(enum stack_type type)
{
- BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
+ BUILD_BUG_ON(N_EXCEPTION_STACKS != 4);
if (type == STACK_TYPE_IRQ)
return "IRQ";
@@ -79,7 +77,6 @@
struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = {
EPAGERANGE(DF),
EPAGERANGE(NMI),
- EPAGERANGE(DB1),
EPAGERANGE(DB),
EPAGERANGE(MCE),
};
@@ -91,7 +88,7 @@
struct pt_regs *regs;
unsigned int k;
- BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
+ BUILD_BUG_ON(N_EXCEPTION_STACKS != 4);
begin = (unsigned long)__this_cpu_read(cea_exception_stacks);
/*
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index aa5d28a..083a3da 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -12,7 +12,7 @@
#include <asm/frame.h>
.code64
- .section .entry.text, "ax"
+ .section .text, "ax"
#ifdef CONFIG_FRAME_POINTER
/* Save parent and function stack frames (rip and rbp) */
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 4fc33fdf0..16da4ac 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -29,15 +29,16 @@
#ifdef CONFIG_PARAVIRT_XXL
#include <asm/asm-offsets.h>
#include <asm/paravirt.h>
+#define GET_CR2_INTO(reg) GET_CR2_INTO_AX ; _ASM_MOV %_ASM_AX, reg
#else
#define INTERRUPT_RETURN iretq
+#define GET_CR2_INTO(reg) _ASM_MOV %cr2, reg
#endif
-/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE
+/*
+ * We are not able to switch in one step to the final KERNEL ADDRESS SPACE
* because we need identity-mapped pages.
- *
*/
-
#define l4_index(x) (((x) >> 39) & 511)
#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 4d8d53e..8cdf29f 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -32,6 +32,8 @@
#include <asm/processor.h>
#include <asm/debugreg.h>
#include <asm/user.h>
+#include <asm/desc.h>
+#include <asm/tlbflush.h>
/* Per cpu debug control register value */
DEFINE_PER_CPU(unsigned long, cpu_dr7);
@@ -97,6 +99,8 @@
unsigned long *dr7;
int i;
+ lockdep_assert_irqs_disabled();
+
for (i = 0; i < HBP_NUM; i++) {
struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]);
@@ -115,6 +119,12 @@
dr7 = this_cpu_ptr(&cpu_dr7);
*dr7 |= encode_dr7(i, info->len, info->type);
+ /*
+ * Ensure we first write cpu_dr7 before we set the DR7 register.
+ * This ensures an NMI never sees cpu_dr7 as 0 while DR7 is non-zero.
+ */
+ barrier();
+
set_debugreg(*dr7, 7);
if (info->mask)
set_dr_addr_mask(info->mask, i);
@@ -134,9 +144,11 @@
void arch_uninstall_hw_breakpoint(struct perf_event *bp)
{
struct arch_hw_breakpoint *info = counter_arch_bp(bp);
- unsigned long *dr7;
+ unsigned long dr7;
int i;
+ lockdep_assert_irqs_disabled();
+
for (i = 0; i < HBP_NUM; i++) {
struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]);
@@ -149,12 +161,20 @@
if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
return;
- dr7 = this_cpu_ptr(&cpu_dr7);
- *dr7 &= ~__encode_dr7(i, info->len, info->type);
+ dr7 = this_cpu_read(cpu_dr7);
+ dr7 &= ~__encode_dr7(i, info->len, info->type);
- set_debugreg(*dr7, 7);
+ set_debugreg(dr7, 7);
if (info->mask)
set_dr_addr_mask(0, i);
+
+ /*
+ * Ensure the write to cpu_dr7 is after we've set the DR7 register.
+ * This ensures an NMI never sees cpu_dr7 as 0 while DR7 is non-zero.
+ */
+ barrier();
+
+ this_cpu_write(cpu_dr7, dr7);
}
static int arch_bp_generic_len(int x86_len)
@@ -227,10 +247,76 @@
return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX);
}
+/*
+ * Checks whether the range [addr, end] overlaps the area [base, base + size).
+ */
+static inline bool within_area(unsigned long addr, unsigned long end,
+ unsigned long base, unsigned long size)
+{
+ return end >= base && addr < (base + size);
+}
+
+/*
+ * Checks whether the range from addr to end, inclusive, overlaps the
+ * fixed-mapped CPU entry area or other ranges used for CPU entry.
+ */
+static inline bool within_cpu_entry(unsigned long addr, unsigned long end)
+{
+ int cpu;
+
+ /* CPU entry area is always used for CPU entry */
+ if (within_area(addr, end, CPU_ENTRY_AREA_BASE,
+ CPU_ENTRY_AREA_TOTAL_SIZE))
+ return true;
+
+ for_each_possible_cpu(cpu) {
+ /* The original rw GDT is being used after load_direct_gdt() */
+ if (within_area(addr, end, (unsigned long)get_cpu_gdt_rw(cpu),
+ GDT_SIZE))
+ return true;
+
+ /*
+ * cpu_tss_rw is not directly referenced by hardware, but it is
+ * used by the CPU entry code, so protect it as well.
+ */
+ if (within_area(addr, end,
+ (unsigned long)&per_cpu(cpu_tss_rw, cpu),
+ sizeof(struct tss_struct)))
+ return true;
+
+ /*
+ * cpu_tlbstate.user_pcid_flush_mask is used for CPU entry.
+ * A data breakpoint on it would cause an unwanted #DB.
+ * Protect the full cpu_tlbstate structure to be sure.
+ */
+ if (within_area(addr, end,
+ (unsigned long)&per_cpu(cpu_tlbstate, cpu),
+ sizeof(struct tlb_state)))
+ return true;
+ }
+
+ return false;
+}
+
static int arch_build_bp_info(struct perf_event *bp,
const struct perf_event_attr *attr,
struct arch_hw_breakpoint *hw)
{
+ unsigned long bp_end;
+
+ bp_end = attr->bp_addr + attr->bp_len - 1;
+ if (bp_end < attr->bp_addr)
+ return -EINVAL;
+
+ /*
+ * Prevent any breakpoint of any type that overlaps the CPU
+ * entry area and data. This protects the IST stacks and also
+ * reduces the chance that we ever find out what happens if
+ * there's a data breakpoint on the GDT, IDT, or TSS.
+ */
+ if (within_cpu_entry(attr->bp_addr, bp_end))
+ return -EINVAL;
+
hw->address = attr->bp_addr;
hw->mask = 0;
@@ -439,7 +525,7 @@
{
int i, cpu, rc = NOTIFY_STOP;
struct perf_event *bp;
- unsigned long dr7, dr6;
+ unsigned long dr6;
unsigned long *dr6_p;
/* The DR6 value is pointed by args->err */
@@ -454,9 +540,6 @@
if ((dr6 & DR_TRAP_BITS) == 0)
return NOTIFY_DONE;
- get_debugreg(dr7, 7);
- /* Disable breakpoints during exception handling */
- set_debugreg(0UL, 7);
/*
* Assert that local interrupts are disabled
* Reset the DRn bits in the virtualized register value.
@@ -513,7 +596,6 @@
(dr6 & (~DR_TRAP_BITS)))
rc = NOTIFY_DONE;
- set_debugreg(dr7, 7);
put_cpu();
return rc;
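
The within_area() predicate added above is the standard overlap test between a
closed range and a half-open area: [addr, end] intersects [base, base + size)
exactly when end >= base and addr < base + size. A standalone check of the
edge cases::

    #include <assert.h>
    #include <stdbool.h>

    static bool within_area(unsigned long addr, unsigned long end,
                            unsigned long base, unsigned long size)
    {
        return end >= base && addr < (base + size);
    }

    int main(void)
    {
        /* area under test: [0x1000, 0x2000) */
        assert(!within_area(0x0800, 0x0fff, 0x1000, 0x1000)); /* below   */
        assert( within_area(0x0fff, 0x1000, 0x1000, 0x1000)); /* touches */
        assert( within_area(0x1800, 0x1800, 0x1000, 0x1000)); /* inside  */
        assert(!within_area(0x2000, 0x2fff, 0x1000, 0x1000)); /* above   */
        return 0;
    }
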
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 87ef69a..0db2120 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -4,6 +4,8 @@
*/
#include <linux/interrupt.h>
+#include <asm/cpu_entry_area.h>
+#include <asm/set_memory.h>
#include <asm/traps.h>
#include <asm/proto.h>
#include <asm/desc.h>
@@ -51,15 +53,23 @@
#define TSKG(_vector, _gdt) \
G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3)
+#define IDT_TABLE_SIZE (IDT_ENTRIES * sizeof(gate_desc))
+
+static bool idt_setup_done __initdata;
+
/*
* Early traps running on the DEFAULT_STACK because the other interrupt
* stacks work only after cpu_init().
*/
static const __initconst struct idt_data early_idts[] = {
- INTG(X86_TRAP_DB, debug),
- SYSG(X86_TRAP_BP, int3),
+ INTG(X86_TRAP_DB, asm_exc_debug),
+ SYSG(X86_TRAP_BP, asm_exc_int3),
+
#ifdef CONFIG_X86_32
- INTG(X86_TRAP_PF, page_fault),
+ /*
+ * Not possible on 64-bit. See idt_setup_early_pf() for details.
+ */
+ INTG(X86_TRAP_PF, asm_exc_page_fault),
#endif
};
@@ -70,33 +80,33 @@
* set up TSS.
*/
static const __initconst struct idt_data def_idts[] = {
- INTG(X86_TRAP_DE, divide_error),
- INTG(X86_TRAP_NMI, nmi),
- INTG(X86_TRAP_BR, bounds),
- INTG(X86_TRAP_UD, invalid_op),
- INTG(X86_TRAP_NM, device_not_available),
- INTG(X86_TRAP_OLD_MF, coprocessor_segment_overrun),
- INTG(X86_TRAP_TS, invalid_TSS),
- INTG(X86_TRAP_NP, segment_not_present),
- INTG(X86_TRAP_SS, stack_segment),
- INTG(X86_TRAP_GP, general_protection),
- INTG(X86_TRAP_SPURIOUS, spurious_interrupt_bug),
- INTG(X86_TRAP_MF, coprocessor_error),
- INTG(X86_TRAP_AC, alignment_check),
- INTG(X86_TRAP_XF, simd_coprocessor_error),
+ INTG(X86_TRAP_DE, asm_exc_divide_error),
+ INTG(X86_TRAP_NMI, asm_exc_nmi),
+ INTG(X86_TRAP_BR, asm_exc_bounds),
+ INTG(X86_TRAP_UD, asm_exc_invalid_op),
+ INTG(X86_TRAP_NM, asm_exc_device_not_available),
+ INTG(X86_TRAP_OLD_MF, asm_exc_coproc_segment_overrun),
+ INTG(X86_TRAP_TS, asm_exc_invalid_tss),
+ INTG(X86_TRAP_NP, asm_exc_segment_not_present),
+ INTG(X86_TRAP_SS, asm_exc_stack_segment),
+ INTG(X86_TRAP_GP, asm_exc_general_protection),
+ INTG(X86_TRAP_SPURIOUS, asm_exc_spurious_interrupt_bug),
+ INTG(X86_TRAP_MF, asm_exc_coprocessor_error),
+ INTG(X86_TRAP_AC, asm_exc_alignment_check),
+ INTG(X86_TRAP_XF, asm_exc_simd_coprocessor_error),
#ifdef CONFIG_X86_32
TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS),
#else
- INTG(X86_TRAP_DF, double_fault),
+ INTG(X86_TRAP_DF, asm_exc_double_fault),
#endif
- INTG(X86_TRAP_DB, debug),
+ INTG(X86_TRAP_DB, asm_exc_debug),
#ifdef CONFIG_X86_MCE
- INTG(X86_TRAP_MC, &machine_check),
+ INTG(X86_TRAP_MC, asm_exc_machine_check),
#endif
- SYSG(X86_TRAP_OF, overflow),
+ SYSG(X86_TRAP_OF, asm_exc_overflow),
#if defined(CONFIG_IA32_EMULATION)
SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat),
#elif defined(CONFIG_X86_32)
@@ -109,95 +119,63 @@
*/
static const __initconst struct idt_data apic_idts[] = {
#ifdef CONFIG_SMP
- INTG(RESCHEDULE_VECTOR, reschedule_interrupt),
- INTG(CALL_FUNCTION_VECTOR, call_function_interrupt),
- INTG(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt),
- INTG(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt),
- INTG(REBOOT_VECTOR, reboot_interrupt),
+ INTG(RESCHEDULE_VECTOR, asm_sysvec_reschedule_ipi),
+ INTG(CALL_FUNCTION_VECTOR, asm_sysvec_call_function),
+ INTG(CALL_FUNCTION_SINGLE_VECTOR, asm_sysvec_call_function_single),
+ INTG(IRQ_MOVE_CLEANUP_VECTOR, asm_sysvec_irq_move_cleanup),
+ INTG(REBOOT_VECTOR, asm_sysvec_reboot),
#endif
#ifdef CONFIG_X86_THERMAL_VECTOR
- INTG(THERMAL_APIC_VECTOR, thermal_interrupt),
+ INTG(THERMAL_APIC_VECTOR, asm_sysvec_thermal),
#endif
#ifdef CONFIG_X86_MCE_THRESHOLD
- INTG(THRESHOLD_APIC_VECTOR, threshold_interrupt),
+ INTG(THRESHOLD_APIC_VECTOR, asm_sysvec_threshold),
#endif
#ifdef CONFIG_X86_MCE_AMD
- INTG(DEFERRED_ERROR_VECTOR, deferred_error_interrupt),
+ INTG(DEFERRED_ERROR_VECTOR, asm_sysvec_deferred_error),
#endif
#ifdef CONFIG_X86_LOCAL_APIC
- INTG(LOCAL_TIMER_VECTOR, apic_timer_interrupt),
- INTG(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi),
+ INTG(LOCAL_TIMER_VECTOR, asm_sysvec_apic_timer_interrupt),
+ INTG(X86_PLATFORM_IPI_VECTOR, asm_sysvec_x86_platform_ipi),
# ifdef CONFIG_HAVE_KVM
- INTG(POSTED_INTR_VECTOR, kvm_posted_intr_ipi),
- INTG(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi),
- INTG(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi),
+ INTG(POSTED_INTR_VECTOR, asm_sysvec_kvm_posted_intr_ipi),
+ INTG(POSTED_INTR_WAKEUP_VECTOR, asm_sysvec_kvm_posted_intr_wakeup_ipi),
+ INTG(POSTED_INTR_NESTED_VECTOR, asm_sysvec_kvm_posted_intr_nested_ipi),
# endif
# ifdef CONFIG_IRQ_WORK
- INTG(IRQ_WORK_VECTOR, irq_work_interrupt),
+ INTG(IRQ_WORK_VECTOR, asm_sysvec_irq_work),
# endif
-#ifdef CONFIG_X86_UV
- INTG(UV_BAU_MESSAGE, uv_bau_message_intr1),
-#endif
- INTG(SPURIOUS_APIC_VECTOR, spurious_interrupt),
- INTG(ERROR_APIC_VECTOR, error_interrupt),
+# ifdef CONFIG_X86_UV
+ INTG(UV_BAU_MESSAGE, asm_sysvec_uv_bau_message),
+# endif
+ INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt),
+ INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt),
#endif
};
-#ifdef CONFIG_X86_64
-/*
- * Early traps running on the DEFAULT_STACK because the other interrupt
- * stacks work only after cpu_init().
- */
-static const __initconst struct idt_data early_pf_idts[] = {
- INTG(X86_TRAP_PF, page_fault),
-};
-
-/*
- * Override for the debug_idt. Same as the default, but with interrupt
- * stack set to DEFAULT_STACK (0). Required for NMI trap handling.
- */
-static const __initconst struct idt_data dbg_idts[] = {
- INTG(X86_TRAP_DB, debug),
-};
-#endif
-
-/* Must be page-aligned because the real IDT is used in a fixmap. */
-gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss;
+/* Must be page-aligned because the real IDT is used in the cpu entry area */
+static gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss;
struct desc_ptr idt_descr __ro_after_init = {
- .size = (IDT_ENTRIES * 2 * sizeof(unsigned long)) - 1,
+ .size = IDT_TABLE_SIZE - 1,
.address = (unsigned long) idt_table,
};
-#ifdef CONFIG_X86_64
-/* No need to be aligned, but done to keep all IDTs defined the same way. */
-gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss;
+void load_current_idt(void)
+{
+ lockdep_assert_irqs_disabled();
+ load_idt(&idt_descr);
+}
-/*
- * The exceptions which use Interrupt stacks. They are setup after
- * cpu_init() when the TSS has been initialized.
- */
-static const __initconst struct idt_data ist_idts[] = {
- ISTG(X86_TRAP_DB, debug, IST_INDEX_DB),
- ISTG(X86_TRAP_NMI, nmi, IST_INDEX_NMI),
- ISTG(X86_TRAP_DF, double_fault, IST_INDEX_DF),
-#ifdef CONFIG_X86_MCE
- ISTG(X86_TRAP_MC, &machine_check, IST_INDEX_MCE),
-#endif
-};
-
-/*
- * Override for the debug_idt. Same as the default, but with interrupt
- * stack set to DEFAULT_STACK (0). Required for NMI trap handling.
- */
-const struct desc_ptr debug_idt_descr = {
- .size = IDT_ENTRIES * 16 - 1,
- .address = (unsigned long) debug_idt_table,
-};
+#ifdef CONFIG_X86_F00F_BUG
+bool idt_is_f00f_address(unsigned long address)
+{
+ return ((address - idt_descr.address) >> 3) == 6;
+}
#endif
static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d)
@@ -214,7 +192,7 @@
#endif
}
-static void
+static __init void
idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys)
{
gate_desc desc;
@@ -227,7 +205,7 @@
}
}
-static void set_intr_gate(unsigned int n, const void *addr)
+static __init void set_intr_gate(unsigned int n, const void *addr)
{
struct idt_data data;
@@ -266,6 +244,27 @@
}
#ifdef CONFIG_X86_64
+/*
+ * Early traps running on the DEFAULT_STACK because the other interrupt
+ * stacks work only after cpu_init().
+ */
+static const __initconst struct idt_data early_pf_idts[] = {
+ INTG(X86_TRAP_PF, asm_exc_page_fault),
+};
+
+/*
+ * The exceptions which use Interrupt stacks. They are setup after
+ * cpu_init() when the TSS has been initialized.
+ */
+static const __initconst struct idt_data ist_idts[] = {
+ ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB),
+ ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI),
+ ISTG(X86_TRAP_DF, asm_exc_double_fault, IST_INDEX_DF),
+#ifdef CONFIG_X86_MCE
+ ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE),
+#endif
+};
+
/**
* idt_setup_early_pf - Initialize the idt table with early pagefault handler
*
@@ -273,8 +272,10 @@
* cpu_init() is invoked and sets up TSS. The IST variant is installed
* after that.
*
- * FIXME: Why is 32bit and 64bit installing the PF handler at different
- * places in the early setup code?
+ * Note that X86_64 cannot install the real #PF handler in
+ * idt_setup_early_traps() because the memory initialization needs the #PF
+ * handler from the early_idt_handler_array to initialize the early page
+ * tables.
*/
void __init idt_setup_early_pf(void)
{
@@ -289,18 +290,21 @@
{
idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts), true);
}
-
-/**
- * idt_setup_debugidt_traps - Initialize the debug idt table with debug traps
- */
-void __init idt_setup_debugidt_traps(void)
-{
- memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16);
-
- idt_setup_from_table(debug_idt_table, dbg_idts, ARRAY_SIZE(dbg_idts), false);
-}
#endif
+static void __init idt_map_in_cea(void)
+{
+ /*
+ * Set the IDT descriptor to a fixed read-only location in the cpu
+ * entry area, so that the "sidt" instruction will not leak the
+ * location of the kernel, and to defend the IDT against arbitrary
+ * memory write vulnerabilities.
+ */
+ cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table),
+ PAGE_KERNEL_RO);
+ idt_descr.address = CPU_ENTRY_AREA_RO_IDT;
+}
+
/**
* idt_setup_apic_and_irq_gates - Setup APIC/SMP and normal interrupt gates
*/
@@ -318,11 +322,23 @@
#ifdef CONFIG_X86_LOCAL_APIC
for_each_clear_bit_from(i, system_vectors, NR_VECTORS) {
- set_bit(i, system_vectors);
+ /*
+ * Don't set the unassigned system vectors in the
+ * system_vectors bitmap. Otherwise they show up in
+ * /proc/interrupts.
+ */
entry = spurious_entries_start + 8 * (i - FIRST_SYSTEM_VECTOR);
set_intr_gate(i, entry);
}
#endif
+ /* Map IDT into CPU entry area and reload it. */
+ idt_map_in_cea();
+ load_idt(&idt_descr);
+
+ /* Make the IDT table read only */
+ set_memory_ro((unsigned long)&idt_table, 1);
+
+ idt_setup_done = true;
}
/**
@@ -352,16 +368,14 @@
load_idt(&idt);
}
-void __init update_intr_gate(unsigned int n, const void *addr)
+void __init alloc_intr_gate(unsigned int n, const void *addr)
{
- if (WARN_ON_ONCE(!test_bit(n, system_vectors)))
+ if (WARN_ON(n < FIRST_SYSTEM_VECTOR))
return;
- set_intr_gate(n, addr);
-}
-void alloc_intr_gate(unsigned int n, const void *addr)
-{
- BUG_ON(n < FIRST_SYSTEM_VECTOR);
- if (!test_and_set_bit(n, system_vectors))
+ if (WARN_ON(idt_setup_done))
+ return;
+
+ if (!WARN_ON(test_and_set_bit(n, system_vectors)))
set_intr_gate(n, addr);
}
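
idt_is_f00f_address() above works because 32-bit IDT gate descriptors are 8
bytes each, so (address - idt_descr.address) >> 3 recovers the vector a
faulting access was aimed at; the F00F workaround only needs to know whether
that vector is 6, the #UD entry. A worked example with a hypothetical base
address::

    #include <assert.h>

    /* each 32-bit gate descriptor is 8 bytes, hence the >> 3 */
    static int fault_vector(unsigned long address, unsigned long idt_base)
    {
        return (int)((address - idt_base) >> 3);
    }

    int main(void)
    {
        unsigned long idt_base = 0xffd00000UL;  /* hypothetical base */

        assert(fault_vector(idt_base + 6 * 8, idt_base) == 6);
        assert(fault_vector(idt_base + 14 * 8, idt_base) == 14);
        return 0;
    }
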
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index c7965ff..1810602 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -13,12 +13,14 @@
#include <linux/export.h>
#include <linux/irq.h>
+#include <asm/irq_stack.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/irq.h>
#include <asm/mce.h>
#include <asm/hw_irq.h>
#include <asm/desc.h>
+#include <asm/traps.h>
#define CREATE_TRACE_POINTS
#include <asm/trace/irq_vectors.h>
@@ -26,9 +28,6 @@
DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
EXPORT_PER_CPU_SYMBOL(irq_stat);
-DEFINE_PER_CPU(struct pt_regs *, irq_regs);
-EXPORT_PER_CPU_SYMBOL(irq_regs);
-
atomic_t irq_err_count;
/*
@@ -224,35 +223,35 @@
return sum;
}
+static __always_inline void handle_irq(struct irq_desc *desc,
+ struct pt_regs *regs)
+{
+ if (IS_ENABLED(CONFIG_X86_64))
+ run_on_irqstack_cond(desc->handle_irq, desc, regs);
+ else
+ __handle_irq(desc, regs);
+}
/*
- * do_IRQ handles all normal device IRQ's (the special
- * SMP cross-CPU interrupts have their own specific
- * handlers).
+ * common_interrupt() handles all normal device IRQs (the special SMP
+ * cross-CPU interrupts have their own entry points).
*/
-__visible void __irq_entry do_IRQ(struct pt_regs *regs)
+DEFINE_IDTENTRY_IRQ(common_interrupt)
{
struct pt_regs *old_regs = set_irq_regs(regs);
- struct irq_desc * desc;
- /* high bit used in ret_from_ code */
- unsigned vector = ~regs->orig_ax;
+ struct irq_desc *desc;
- entering_irq();
-
- /* entering_irq() tells RCU that we're not quiescent. Check it. */
+ /* entry code tells RCU that we're not quiescent. Check it. */
RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");
desc = __this_cpu_read(vector_irq[vector]);
if (likely(!IS_ERR_OR_NULL(desc))) {
- if (IS_ENABLED(CONFIG_X86_32))
- handle_irq(desc, regs);
- else
- generic_handle_irq_desc(desc);
+ handle_irq(desc, regs);
} else {
ack_APIC_irq();
if (desc == VECTOR_UNUSED) {
- pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n",
+ pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n",
__func__, smp_processor_id(),
vector);
} else {
@@ -260,8 +259,6 @@
}
}
- exiting_irq();
-
set_irq_regs(old_regs);
}
@@ -271,17 +268,16 @@
/*
* Handler for X86_PLATFORM_IPI_VECTOR.
*/
-__visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_x86_platform_ipi)
{
struct pt_regs *old_regs = set_irq_regs(regs);
- entering_ack_irq();
+ ack_APIC_irq();
trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR);
inc_irq_stat(x86_platform_ipis);
if (x86_platform_ipi_callback)
x86_platform_ipi_callback();
trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR);
- exiting_irq();
set_irq_regs(old_regs);
}
#endif
@@ -302,41 +298,29 @@
/*
* Handler for POSTED_INTERRUPT_VECTOR.
*/
-__visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_ipi)
{
- struct pt_regs *old_regs = set_irq_regs(regs);
-
- entering_ack_irq();
+ ack_APIC_irq();
inc_irq_stat(kvm_posted_intr_ipis);
- exiting_irq();
- set_irq_regs(old_regs);
}
/*
* Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
*/
-__visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_posted_intr_wakeup_ipi)
{
- struct pt_regs *old_regs = set_irq_regs(regs);
-
- entering_ack_irq();
+ ack_APIC_irq();
inc_irq_stat(kvm_posted_intr_wakeup_ipis);
kvm_posted_intr_wakeup_handler();
- exiting_irq();
- set_irq_regs(old_regs);
}
/*
* Handler for POSTED_INTERRUPT_NESTED_VECTOR.
*/
-__visible void smp_kvm_posted_intr_nested_ipi(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_nested_ipi)
{
- struct pt_regs *old_regs = set_irq_regs(regs);
-
- entering_ack_irq();
+ ack_APIC_irq();
inc_irq_stat(kvm_posted_intr_nested_ipis);
- exiting_irq();
- set_irq_regs(old_regs);
}
#endif
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index a759ca9..0b79efc 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -148,7 +148,7 @@
call_on_stack(__do_softirq, isp);
}
-void handle_irq(struct irq_desc *desc, struct pt_regs *regs)
+void __handle_irq(struct irq_desc *desc, struct pt_regs *regs)
{
int overflow = check_stack_overflow();
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 6b32ab0..1b4fe93 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -20,6 +20,7 @@
#include <linux/sched/task_stack.h>
#include <asm/cpu_entry_area.h>
+#include <asm/irq_stack.h>
#include <asm/io_apic.h>
#include <asm/apic.h>
@@ -70,3 +71,8 @@
return 0;
return map_irq_stack(cpu);
}
+
+void do_softirq_own_stack(void)
+{
+ run_on_irqstack_cond(__do_softirq, NULL, NULL);
+}
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
index 80bee76..890d477 100644
--- a/arch/x86/kernel/irq_work.c
+++ b/arch/x86/kernel/irq_work.c
@@ -9,18 +9,18 @@
#include <linux/irq_work.h>
#include <linux/hardirq.h>
#include <asm/apic.h>
+#include <asm/idtentry.h>
#include <asm/trace/irq_vectors.h>
#include <linux/interrupt.h>
#ifdef CONFIG_X86_LOCAL_APIC
-__visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_irq_work)
{
- ipi_entering_ack_irq();
+ ack_APIC_irq();
trace_irq_work_entry(IRQ_WORK_VECTOR);
inc_irq_stat(apic_irq_work_irqs);
irq_work_run();
trace_irq_work_exit(IRQ_WORK_VECTOR);
- exiting_irq();
}
void arch_irq_work_raise(void)
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 85de8fa..3bafe1b 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -1073,13 +1073,6 @@
int __init arch_populate_kprobe_blacklist(void)
{
- int ret;
-
- ret = kprobe_add_area_blacklist((unsigned long)__irqentry_text_start,
- (unsigned long)__irqentry_text_end);
- if (ret)
- return ret;
-
return kprobe_add_area_blacklist((unsigned long)__entry_text_start,
(unsigned long)__entry_text_end);
}
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 234f58e..321c1995 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -286,9 +286,7 @@
* stack handling and registers setup.
*/
if (((paddr >= (unsigned long)__entry_text_start) &&
- (paddr < (unsigned long)__entry_text_end)) ||
- ((paddr >= (unsigned long)__irqentry_text_start) &&
- (paddr < (unsigned long)__irqentry_text_end)))
+ (paddr < (unsigned long)__entry_text_end)))
return 0;
/* Check there is enough space for a relative jump. */
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 7e6403a..df63786 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -217,7 +217,7 @@
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
-u32 kvm_read_and_reset_apf_flags(void)
+noinstr u32 kvm_read_and_reset_apf_flags(void)
{
u32 flags = 0;
@@ -229,11 +229,11 @@
return flags;
}
EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags);
-NOKPROBE_SYMBOL(kvm_read_and_reset_apf_flags);
-bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
+noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
{
u32 reason = kvm_read_and_reset_apf_flags();
+ bool rcu_exit;
switch (reason) {
case KVM_PV_REASON_PAGE_NOT_PRESENT:
@@ -243,6 +243,9 @@
return false;
}
+ rcu_exit = idtentry_enter_cond_rcu(regs);
+ instrumentation_begin();
+
/*
* If the host managed to inject an async #PF into an interrupt
* disabled region, then die hard as this is not going to end well
@@ -257,13 +260,13 @@
/* Page is swapped out by the host. */
kvm_async_pf_task_wait_schedule(token);
} else {
- rcu_irq_enter();
kvm_async_pf_task_wake(token);
- rcu_irq_exit();
}
+
+ instrumentation_end();
+ idtentry_exit_cond_rcu(regs, rcu_exit);
return true;
}
-NOKPROBE_SYMBOL(__kvm_handle_async_pf);
static void __init paravirt_ops_setup(void)
{
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index bdcc514..2de365f 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -303,7 +303,7 @@
static DEFINE_PER_CPU(bool, swallow_nmi);
static DEFINE_PER_CPU(unsigned long, last_nmi_rip);
-static void default_do_nmi(struct pt_regs *regs)
+static noinstr void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
int handled;
@@ -329,6 +329,9 @@
__this_cpu_write(last_nmi_rip, regs->ip);
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+
handled = nmi_handle(NMI_LOCAL, regs);
__this_cpu_add(nmi_stats.normal, handled);
if (handled) {
@@ -342,7 +345,7 @@
*/
if (handled > 1)
__this_cpu_write(swallow_nmi, true);
- return;
+ goto out;
}
/*
@@ -374,7 +377,7 @@
#endif
__this_cpu_add(nmi_stats.external, 1);
raw_spin_unlock(&nmi_reason_lock);
- return;
+ goto out;
}
raw_spin_unlock(&nmi_reason_lock);
@@ -412,8 +415,12 @@
__this_cpu_add(nmi_stats.swallow, 1);
else
unknown_nmi_error(reason, regs);
+
+out:
+ if (regs->flags & X86_EFLAGS_IF)
+ trace_hardirqs_on_prepare();
+ instrumentation_end();
}
-NOKPROBE_SYMBOL(default_do_nmi);
/*
* NMIs can page fault or hit breakpoints which will cause it to lose
@@ -467,44 +474,9 @@
};
static DEFINE_PER_CPU(enum nmi_states, nmi_state);
static DEFINE_PER_CPU(unsigned long, nmi_cr2);
+static DEFINE_PER_CPU(unsigned long, nmi_dr7);
-#ifdef CONFIG_X86_64
-/*
- * In x86_64, we need to handle breakpoint -> NMI -> breakpoint. Without
- * some care, the inner breakpoint will clobber the outer breakpoint's
- * stack.
- *
- * If a breakpoint is being processed, and the debug stack is being
- * used, if an NMI comes in and also hits a breakpoint, the stack
- * pointer will be set to the same fixed address as the breakpoint that
- * was interrupted, causing that stack to be corrupted. To handle this
- * case, check if the stack that was interrupted is the debug stack, and
- * if so, change the IDT so that new breakpoints will use the current
- * stack and not switch to the fixed address. On return of the NMI,
- * switch back to the original IDT.
- */
-static DEFINE_PER_CPU(int, update_debug_stack);
-
-static bool notrace is_debug_stack(unsigned long addr)
-{
- struct cea_exception_stacks *cs = __this_cpu_read(cea_exception_stacks);
- unsigned long top = CEA_ESTACK_TOP(cs, DB);
- unsigned long bot = CEA_ESTACK_BOT(cs, DB1);
-
- if (__this_cpu_read(debug_stack_usage))
- return true;
- /*
- * Note, this covers the guard page between DB and DB1 as well to
- * avoid two checks. But by all means @addr can never point into
- * the guard page.
- */
- return addr >= bot && addr < top;
-}
-NOKPROBE_SYMBOL(is_debug_stack);
-#endif
-
-dotraplinkage notrace void
-do_nmi(struct pt_regs *regs, long error_code)
+DEFINE_IDTENTRY_RAW(exc_nmi)
{
if (IS_ENABLED(CONFIG_SMP) && cpu_is_offline(smp_processor_id()))
return;
@@ -517,18 +489,7 @@
this_cpu_write(nmi_cr2, read_cr2());
nmi_restart:
-#ifdef CONFIG_X86_64
- /*
- * If we interrupted a breakpoint, it is possible that
- * the nmi handler will have breakpoints too. We need to
- * change the IDT such that breakpoints that happen here
- * continue to use the NMI stack.
- */
- if (unlikely(is_debug_stack(regs->sp))) {
- debug_stack_set_zero();
- this_cpu_write(update_debug_stack, 1);
- }
-#endif
+ this_cpu_write(nmi_dr7, local_db_save());
nmi_enter();
@@ -539,12 +500,7 @@
nmi_exit();
-#ifdef CONFIG_X86_64
- if (unlikely(this_cpu_read(update_debug_stack))) {
- debug_stack_reset();
- this_cpu_write(update_debug_stack, 0);
- }
-#endif
+ local_db_restore(this_cpu_read(nmi_dr7));
if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
write_cr2(this_cpu_read(nmi_cr2));
@@ -554,7 +510,6 @@
if (user_mode(regs))
mds_user_clear_cpu_buffers();
}
-NOKPROBE_SYMBOL(do_nmi);
void stop_nmi(void)
{
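
The new nmi_dr7 logic replaces the IDT/debug-stack juggling with a plain
save/clear/restore of the hardware breakpoint state around the handler; the
saved value travels with each invocation, so nested entries still balance.
A rough userspace model (the _model() helpers are invented stand-ins for
local_db_save()/local_db_restore(), with a variable faking DR7)::

  #include <stdio.h>

  static unsigned long dr7;    /* stand-in for the debug control register */

  static unsigned long local_db_save_model(void)
  {
      unsigned long saved = dr7;

      dr7 = 0;                 /* breakpoints disabled while handling */
      return saved;
  }

  static void local_db_restore_model(unsigned long saved)
  {
      dr7 = saved;
  }

  static void nmi_body(int depth)
  {
      unsigned long saved = local_db_save_model();

      printf("depth %d: dr7=%#lx inside handler\n", depth, dr7);
      if (depth < 2)
          nmi_body(depth + 1); /* nested entry still balances */
      local_db_restore_model(saved);
  }

  int main(void)
  {
      dr7 = 0x401;             /* pretend one breakpoint is armed */
      nmi_body(0);
      printf("after: dr7=%#lx\n", dr7);
      return 0;
  }
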
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index b8d4e9c..eff4ce3 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -27,6 +27,7 @@
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/apic.h>
+#include <asm/idtentry.h>
#include <asm/nmi.h>
#include <asm/mce.h>
#include <asm/trace/irq_vectors.h>
@@ -130,13 +131,11 @@
/*
* this function calls the 'stop' function on all other CPUs in the system.
*/
-
-asmlinkage __visible void smp_reboot_interrupt(void)
+DEFINE_IDTENTRY_SYSVEC(sysvec_reboot)
{
- ipi_entering_ack_irq();
+ ack_APIC_irq();
cpu_emergency_vmxoff();
stop_this_cpu(NULL);
- irq_exit();
}
static int register_stop_handler(void)
@@ -221,47 +220,33 @@
/*
* Reschedule call back. KVM uses this interrupt to force a cpu out of
- * guest mode
+ * guest mode.
*/
-__visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_reschedule_ipi)
{
ack_APIC_irq();
+ trace_reschedule_entry(RESCHEDULE_VECTOR);
inc_irq_stat(irq_resched_count);
- kvm_set_cpu_l1tf_flush_l1d();
-
- if (trace_resched_ipi_enabled()) {
- /*
- * scheduler_ipi() might call irq_enter() as well, but
- * nested calls are fine.
- */
- irq_enter();
- trace_reschedule_entry(RESCHEDULE_VECTOR);
- scheduler_ipi();
- trace_reschedule_exit(RESCHEDULE_VECTOR);
- irq_exit();
- return;
- }
scheduler_ipi();
+ trace_reschedule_exit(RESCHEDULE_VECTOR);
}
-__visible void __irq_entry smp_call_function_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_call_function)
{
- ipi_entering_ack_irq();
+ ack_APIC_irq();
trace_call_function_entry(CALL_FUNCTION_VECTOR);
inc_irq_stat(irq_call_count);
generic_smp_call_function_interrupt();
trace_call_function_exit(CALL_FUNCTION_VECTOR);
- exiting_irq();
}
-__visible void __irq_entry smp_call_function_single_interrupt(struct pt_regs *r)
+DEFINE_IDTENTRY_SYSVEC(sysvec_call_function_single)
{
- ipi_entering_ack_irq();
+ ack_APIC_irq();
trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR);
inc_irq_stat(irq_call_count);
generic_smp_call_function_single_interrupt();
trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR);
- exiting_irq();
}
static int __init nonmi_ipi_setup(char *str)
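
The conversions above delete open-coded irq_enter()/irq_exit() (or
ipi_entering_ack_irq()/exiting_irq()) pairs because DEFINE_IDTENTRY_SYSVEC()
now generates that bracketing around the handler body. A compilable toy of
how such a macro wraps a body so the exit side cannot be forgotten (macro
and helper names invented for illustration)::

  #include <stdio.h>

  static void irq_enter_model(void) { puts("enter: state accounting"); }
  static void irq_exit_model(void)  { puts("exit: softirq processing"); }

  /* Paste a wrapper around the open-coded body, in the spirit of the
   * DEFINE_IDTENTRY_SYSVEC() family. */
  #define DEFINE_SYSVEC(name)                  \
      static void __##name(void);              \
      static void name(void)                   \
      {                                        \
          irq_enter_model();                   \
          __##name();                          \
          irq_exit_model();                    \
      }                                        \
      static void __##name(void)

  DEFINE_SYSVEC(sysvec_example)
  {
      puts("ack + handle vector");
  }

  int main(void)
  {
      sysvec_example();
      return 0;
  }
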
diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c
index 496748e..fcfc077 100644
--- a/arch/x86/kernel/tracepoint.c
+++ b/arch/x86/kernel/tracepoint.c
@@ -25,20 +25,3 @@
{
static_branch_dec(&trace_pagefault_key);
}
-
-#ifdef CONFIG_SMP
-
-DEFINE_STATIC_KEY_FALSE(trace_resched_ipi_key);
-
-int trace_resched_ipi_reg(void)
-{
- static_branch_inc(&trace_resched_ipi_key);
- return 0;
-}
-
-void trace_resched_ipi_unreg(void)
-{
- static_branch_dec(&trace_resched_ipi_key);
-}
-
-#endif
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 4cc5410..af75109 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -97,24 +97,6 @@
return ud == INSN_UD0 || ud == INSN_UD2;
}
-int fixup_bug(struct pt_regs *regs, int trapnr)
-{
- if (trapnr != X86_TRAP_UD)
- return 0;
-
- switch (report_bug(regs->ip, regs)) {
- case BUG_TRAP_TYPE_NONE:
- case BUG_TRAP_TYPE_BUG:
- break;
-
- case BUG_TRAP_TYPE_WARN:
- regs->ip += LEN_UD2;
- return 1;
- }
-
- return 0;
-}
-
static nokprobe_inline int
do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str,
struct pt_regs *regs, long error_code)
@@ -145,7 +127,7 @@
* process no chance to handle the signal and notice the
* kernel fault information, so that won't result in polluting
* the information about previously queued, but not yet
- * delivered, faults. See also do_general_protection below.
+ * delivered, faults. See also exc_general_protection below.
*/
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = trapnr;
@@ -190,42 +172,120 @@
{
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
- /*
- * WARN*()s end up here; fix them up before we call the
- * notifier chain.
- */
- if (!user_mode(regs) && fixup_bug(regs, trapnr))
- return;
-
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
NOTIFY_STOP) {
cond_local_irq_enable(regs);
do_trap(trapnr, signr, str, regs, error_code, sicode, addr);
+ cond_local_irq_disable(regs);
}
}
-#define IP ((void __user *)uprobe_get_trap_addr(regs))
-#define DO_ERROR(trapnr, signr, sicode, addr, str, name) \
-dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
-{ \
- do_error_trap(regs, error_code, str, trapnr, signr, sicode, addr); \
+/*
+ * POSIX requires the address of the faulting instruction to be provided
+ * for SIGILL (#UD) and SIGFPE (#DE) in the si_addr member of siginfo_t.
+ *
+ * This address is usually regs->ip, but when an uprobe has moved the code
+ * out of line, regs->ip points to the XOL copy, which would confuse
+ * anything that analyzes the fault address against the unmodified binary.
+ * If a trap happened in XOL code, uprobe maps regs->ip back to the
+ * original instruction address.
+ */
+static __always_inline void __user *error_get_trap_addr(struct pt_regs *regs)
+{
+ return (void __user *)uprobe_get_trap_addr(regs);
}
-DO_ERROR(X86_TRAP_DE, SIGFPE, FPE_INTDIV, IP, "divide error", divide_error)
-DO_ERROR(X86_TRAP_OF, SIGSEGV, 0, NULL, "overflow", overflow)
-DO_ERROR(X86_TRAP_UD, SIGILL, ILL_ILLOPN, IP, "invalid opcode", invalid_op)
-DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, 0, NULL, "coprocessor segment overrun", coprocessor_segment_overrun)
-DO_ERROR(X86_TRAP_TS, SIGSEGV, 0, NULL, "invalid TSS", invalid_TSS)
-DO_ERROR(X86_TRAP_NP, SIGBUS, 0, NULL, "segment not present", segment_not_present)
-DO_ERROR(X86_TRAP_SS, SIGBUS, 0, NULL, "stack segment", stack_segment)
-#undef IP
+DEFINE_IDTENTRY(exc_divide_error)
+{
+ do_error_trap(regs, 0, "divide_error", X86_TRAP_DE, SIGFPE,
+ FPE_INTDIV, error_get_trap_addr(regs));
+}
-dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code)
+DEFINE_IDTENTRY(exc_overflow)
+{
+ do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL);
+}
+
+#ifdef CONFIG_X86_F00F_BUG
+void handle_invalid_op(struct pt_regs *regs)
+#else
+static inline void handle_invalid_op(struct pt_regs *regs)
+#endif
+{
+ do_error_trap(regs, 0, "invalid opcode", X86_TRAP_UD, SIGILL,
+ ILL_ILLOPN, error_get_trap_addr(regs));
+}
+
+DEFINE_IDTENTRY_RAW(exc_invalid_op)
+{
+ bool rcu_exit;
+
+ /*
+ * Handle BUG/WARN like NMIs instead of like normal idtentries:
+ * if we bugged/warned in a bad RCU context, for example, the last
+ * thing we want is to BUG/WARN again in the idtentry code, ad
+ * infinitum.
+ */
+ if (!user_mode(regs) && is_valid_bugaddr(regs->ip)) {
+ enum bug_trap_type type;
+
+ nmi_enter();
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+ type = report_bug(regs->ip, regs);
+ if (regs->flags & X86_EFLAGS_IF)
+ trace_hardirqs_on_prepare();
+ instrumentation_end();
+ nmi_exit();
+
+ if (type == BUG_TRAP_TYPE_WARN) {
+ /* Skip the ud2. */
+ regs->ip += LEN_UD2;
+ return;
+ }
+
+ /*
+ * Else, if this was a BUG and report_bug returns or if this
+ * was just a normal #UD, we want to continue onward and
+ * crash.
+ */
+ }
+
+ rcu_exit = idtentry_enter_cond_rcu(regs);
+ instrumentation_begin();
+ handle_invalid_op(regs);
+ instrumentation_end();
+ idtentry_exit_cond_rcu(regs, rcu_exit);
+}
+
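
The WARN leg above recovers by stepping regs->ip over the ud2 instruction
(LEN_UD2 bytes), so the kernel resumes right after the WARN site. The same
trick is observable from userspace with a SIGILL handler on x86-64 Linux --
a sketch only, assuming the 2-byte 0f 0b encoding of ud2::

  #define _GNU_SOURCE
  #include <signal.h>
  #include <stdio.h>
  #include <ucontext.h>

  #define LEN_UD2 2    /* ud2 encodes as 0f 0b */

  static void sigill_handler(int sig, siginfo_t *info, void *uctx)
  {
      ucontext_t *uc = uctx;

      (void)sig; (void)info;
      /* Skip the ud2 so execution continues at the next instruction. */
      uc->uc_mcontext.gregs[REG_RIP] += LEN_UD2;
  }

  int main(void)
  {
      struct sigaction sa = {
          .sa_sigaction = sigill_handler,
          .sa_flags = SA_SIGINFO,
      };

      sigaction(SIGILL, &sa, NULL);
      puts("before ud2");
      __asm__ volatile("ud2");
      puts("after ud2: handler skipped it");
      return 0;
  }
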
+DEFINE_IDTENTRY(exc_coproc_segment_overrun)
+{
+ do_error_trap(regs, 0, "coprocessor segment overrun",
+ X86_TRAP_OLD_MF, SIGFPE, 0, NULL);
+}
+
+DEFINE_IDTENTRY_ERRORCODE(exc_invalid_tss)
+{
+ do_error_trap(regs, error_code, "invalid TSS", X86_TRAP_TS, SIGSEGV,
+ 0, NULL);
+}
+
+DEFINE_IDTENTRY_ERRORCODE(exc_segment_not_present)
+{
+ do_error_trap(regs, error_code, "segment not present", X86_TRAP_NP,
+ SIGBUS, 0, NULL);
+}
+
+DEFINE_IDTENTRY_ERRORCODE(exc_stack_segment)
+{
+ do_error_trap(regs, error_code, "stack segment", X86_TRAP_SS, SIGBUS,
+ 0, NULL);
+}
+
+DEFINE_IDTENTRY_ERRORCODE(exc_alignment_check)
{
char *str = "alignment check";
- RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
-
if (notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_AC, SIGBUS) == NOTIFY_STOP)
return;
@@ -271,12 +331,19 @@
* from the TSS. Returning is, in principle, okay, but changes to regs will
* be lost. If, for some reason, we need to return to a context with modified
* regs, the shim code could be adjusted to synchronize the registers.
+ *
+ * The 32-bit #DF shim already provides CR2 as an argument. On 64-bit it
+ * needs to be read before doing anything else.
*/
-dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2)
+DEFINE_IDTENTRY_DF(exc_double_fault)
{
static const char str[] = "double fault";
struct task_struct *tsk = current;
+#ifdef CONFIG_VMAP_STACK
+ unsigned long address = read_cr2();
+#endif
+
#ifdef CONFIG_X86_ESPFIX64
extern unsigned char native_irq_return_iret[];
@@ -299,6 +366,7 @@
regs->ip == (unsigned long)native_irq_return_iret)
{
struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
+ unsigned long *p = (unsigned long *)regs->sp;
/*
* regs->sp points to the failing IRET frame on the
@@ -306,7 +374,11 @@
* in gpregs->ss through gpregs->ip.
*
*/
- memmove(&gpregs->ip, (void *)regs->sp, 5*8);
+ gpregs->ip = p[0];
+ gpregs->cs = p[1];
+ gpregs->flags = p[2];
+ gpregs->sp = p[3];
+ gpregs->ss = p[4];
gpregs->orig_ax = 0; /* Missing (lost) #GP error code */
/*
@@ -320,7 +392,7 @@
* which is what the stub expects, given that the faulting
* RIP will be the IRET instruction.
*/
- regs->ip = (unsigned long)general_protection;
+ regs->ip = (unsigned long)asm_exc_general_protection;
regs->sp = (unsigned long)&gpregs->orig_ax;
return;
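
The memmove() of 5*8 bytes becomes five explicit assignments because this
code now runs in a noinstr region that must not call out to instrumentable
library routines. Those five machine words are just the hardware IRET
frame; a standalone illustration of the layout and the open-coded copy
(made-up values, userspace C)::

  #include <stdio.h>

  /* The five words the CPU pushes for an x86-64 interrupt return frame,
   * in the order the field-by-field copy above spells out. */
  struct iret_frame {
      unsigned long ip;
      unsigned long cs;
      unsigned long flags;
      unsigned long sp;
      unsigned long ss;
  };

  int main(void)
  {
      unsigned long stack[5] = { 0x401000, 0x10, 0x202, 0x7ffdf000, 0x18 };
      unsigned long *p = stack;    /* models (unsigned long *)regs->sp */
      struct iret_frame f;

      /* Open-coded: no memmove() call from a noinstr region. */
      f.ip    = p[0];
      f.cs    = p[1];
      f.flags = p[2];
      f.sp    = p[3];
      f.ss    = p[4];

      printf("ip=%#lx cs=%#lx flags=%#lx sp=%#lx ss=%#lx\n",
             f.ip, f.cs, f.flags, f.sp, f.ss);
      return 0;
  }
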
@@ -328,6 +400,7 @@
#endif
nmi_enter();
+ instrumentation_begin();
notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
tsk->thread.error_code = error_code;
@@ -371,27 +444,31 @@
* stack even if the actual trigger for the double fault was
* something else.
*/
- if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE)
- handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
+ if ((unsigned long)task_stack_page(tsk) - 1 - address < PAGE_SIZE) {
+ handle_stack_overflow("kernel stack overflow (double-fault)",
+ regs, address);
+ }
#endif
pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
die("double fault", regs, error_code);
panic("Machine halted.");
+ instrumentation_end();
}
-dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
+DEFINE_IDTENTRY(exc_bounds)
{
- RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
- if (notify_die(DIE_TRAP, "bounds", regs, error_code,
+ if (notify_die(DIE_TRAP, "bounds", regs, 0,
X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
return;
cond_local_irq_enable(regs);
if (!user_mode(regs))
- die("bounds", regs, error_code);
+ die("bounds", regs, 0);
- do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL);
+ do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, 0, 0, NULL);
+
+ cond_local_irq_disable(regs);
}
enum kernel_gp_hint {
@@ -438,7 +515,7 @@
#define GPFSTR "general protection fault"
-dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code)
+DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
{
char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR;
enum kernel_gp_hint hint = GP_NO_HINT;
@@ -446,17 +523,17 @@
unsigned long gp_addr;
int ret;
- RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
cond_local_irq_enable(regs);
if (static_cpu_has(X86_FEATURE_UMIP)) {
if (user_mode(regs) && fixup_umip_exception(regs))
- return;
+ goto exit;
}
if (v8086_mode(regs)) {
local_irq_enable();
handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
+ local_irq_disable();
return;
}
@@ -468,12 +545,11 @@
show_signal(tsk, SIGSEGV, "", desc, regs, error_code);
force_sig(SIGSEGV);
-
- return;
+ goto exit;
}
if (fixup_exception(regs, X86_TRAP_GP, error_code, 0))
- return;
+ goto exit;
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_GP;
@@ -485,11 +561,11 @@
if (!preemptible() &&
kprobe_running() &&
kprobe_fault_handler(regs, X86_TRAP_GP))
- return;
+ goto exit;
ret = notify_die(DIE_GPF, desc, regs, error_code, X86_TRAP_GP, SIGSEGV);
if (ret == NOTIFY_STOP)
- return;
+ goto exit;
if (error_code)
snprintf(desc, sizeof(desc), "segment-related " GPFSTR);
@@ -511,47 +587,74 @@
die_addr(desc, regs, error_code, gp_addr);
+exit:
+ cond_local_irq_disable(regs);
}
-NOKPROBE_SYMBOL(do_general_protection);
-dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
+static bool do_int3(struct pt_regs *regs)
{
- if (poke_int3_handler(regs))
- return;
-
- /*
- * Unlike any other non-IST entry, we can be called from pretty much
- * any location in the kernel through kprobes -- text_poke() will most
- * likely be handled by poke_int3_handler() above. This means this
- * handler is effectively NMI-like.
- */
- if (!user_mode(regs))
- nmi_enter();
+ int res;
#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
- if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
- SIGTRAP) == NOTIFY_STOP)
- goto exit;
+ if (kgdb_ll_trap(DIE_INT3, "int3", regs, 0, X86_TRAP_BP,
+ SIGTRAP) == NOTIFY_STOP)
+ return true;
#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
#ifdef CONFIG_KPROBES
if (kprobe_int3_handler(regs))
- goto exit;
+ return true;
#endif
+ res = notify_die(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP);
- if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
- SIGTRAP) == NOTIFY_STOP)
- goto exit;
+ return res == NOTIFY_STOP;
+}
+
+static void do_int3_user(struct pt_regs *regs)
+{
+ if (do_int3(regs))
+ return;
cond_local_irq_enable(regs);
- do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, 0, NULL);
+ do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, 0, 0, NULL);
cond_local_irq_disable(regs);
-
-exit:
- if (!user_mode(regs))
- nmi_exit();
}
-NOKPROBE_SYMBOL(do_int3);
+
+DEFINE_IDTENTRY_RAW(exc_int3)
+{
+ /*
+ * poke_int3_handler() is completely self contained code; it does (and
+ * must) *NOT* call out to anything, lest it hits upon yet another
+ * INT3.
+ */
+ if (poke_int3_handler(regs))
+ return;
+
+ /*
+ * idtentry_enter_user() uses static_branch_{,un}likely() and therefore
+ * can trigger INT3, hence poke_int3_handler() must be done
+ * before. If the entry came from kernel mode, then use nmi_enter()
+ * because the INT3 could have been hit in any context including
+ * NMI.
+ */
+ if (user_mode(regs)) {
+ idtentry_enter_user(regs);
+ instrumentation_begin();
+ do_int3_user(regs);
+ instrumentation_end();
+ idtentry_exit_user(regs);
+ } else {
+ nmi_enter();
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+ if (!do_int3(regs))
+ die("int3", regs, 0);
+ if (regs->flags & X86_EFLAGS_IF)
+ trace_hardirqs_on_prepare();
+ instrumentation_end();
+ nmi_exit();
+ }
+}
#ifdef CONFIG_X86_64
/*
@@ -559,21 +662,20 @@
* to switch to the normal thread stack if the interrupted code was in
* user mode. The actual stack switch is done in entry_64.S
*/
-asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
+asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs)
{
struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1;
if (regs != eregs)
*regs = *eregs;
return regs;
}
-NOKPROBE_SYMBOL(sync_regs);
struct bad_iret_stack {
void *error_entry_ret;
struct pt_regs regs;
};
-asmlinkage __visible notrace
+asmlinkage __visible noinstr
struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
{
/*
@@ -584,19 +686,21 @@
* just below the IRET frame) and we want to pretend that the
* exception came from the IRET target.
*/
- struct bad_iret_stack *new_stack =
- (struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
+ struct bad_iret_stack tmp, *new_stack =
+ (struct bad_iret_stack *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
- /* Copy the IRET target to the new stack. */
- memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
+ /* Copy the IRET target to the temporary storage. */
+ memcpy(&tmp.regs.ip, (void *)s->regs.sp, 5*8);
/* Copy the remainder of the stack from the current stack. */
- memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip));
+ memcpy(&tmp, s, offsetof(struct bad_iret_stack, regs.ip));
+
+ /* Update the entry stack */
+ memcpy(new_stack, &tmp, sizeof(tmp));
BUG_ON(!user_mode(&new_stack->regs));
return new_stack;
}
-NOKPROBE_SYMBOL(fixup_bad_iret);
#endif
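
fixup_bad_iret() now builds the repaired frame in a stack temporary and
publishes it with one final memcpy(). The old and new stack regions can
overlap, so staging through tmp keeps every individual memcpy()
non-overlapping -- the job memmove() used to do, minus the out-of-line call
a noinstr function must avoid. A small demonstration with deliberately
overlapping buffers::

  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
      /* One backing array, two overlapping views, like the two stacks. */
      unsigned long stack[12] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
      unsigned long *src = stack;        /* models s */
      unsigned long *dst = stack + 3;    /* models new_stack */
      unsigned long tmp[8];

      /* memcpy(dst, src, ...) directly would be undefined behaviour on
       * overlapping regions. Assemble in tmp, then publish in one
       * non-overlapping copy. */
      memcpy(tmp, src, sizeof(tmp));
      memcpy(dst, tmp, sizeof(tmp));

      for (int i = 0; i < 12; i++)
          printf("%lu ", stack[i]);    /* 0 1 2 0 1 2 3 4 5 6 7 11 */
      printf("\n");
      return 0;
  }
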
static bool is_sysenter_singlestep(struct pt_regs *regs)
@@ -622,6 +726,43 @@
#endif
}
+static __always_inline void debug_enter(unsigned long *dr6, unsigned long *dr7)
+{
+ /*
+ * Disable breakpoints during exception handling; recursive exceptions
+ * are exceedingly 'fun'.
+ *
+ * Since this function is NOKPROBE, and that also applies to
+ * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a
+ * HW_BREAKPOINT_W on our stack)
+ *
+ * Entry text is excluded for HW_BP_X, and cpu_entry_area, which
+ * includes the entry stack, is excluded for everything.
+ */
+ *dr7 = local_db_save();
+
+ /*
+ * The Intel SDM says:
+ *
+ * Certain debug exceptions may clear bits 0-3. The remaining
+ * contents of the DR6 register are never cleared by the
+ * processor. To avoid confusion in identifying debug
+ * exceptions, debug handlers should clear the register before
+ * returning to the interrupted task.
+ *
+ * Keep it simple: clear DR6 immediately.
+ */
+ get_debugreg(*dr6, 6);
+ set_debugreg(0, 6);
+ /* Filter out all the reserved bits which are preset to 1 */
+ *dr6 &= ~DR6_RESERVED;
+}
+
+static __always_inline void debug_exit(unsigned long dr7)
+{
+ local_db_restore(dr7);
+}
+
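
debug_enter() turns the SDM's advice into a read-and-reset: fetch DR6,
clear the register immediately, and strip the reserved bits that read as 1
so callers only ever see genuine trap reasons. The same shape in plain C
(a variable fakes the register; DR6_RESERVED and the DR_STEP bit match the
kernel's definitions)::

  #include <stdio.h>

  #define DR6_RESERVED 0xffff0ff0UL    /* bits the CPU keeps set to 1 */

  static unsigned long dr6_hw = 0xffff4ff0UL;  /* DR_STEP (bit 14) set */

  static unsigned long dr6_fetch(void)
  {
      unsigned long dr6 = dr6_hw;

      dr6_hw = 0;                      /* clear immediately, per the SDM */
      return dr6 & ~DR6_RESERVED;      /* keep only real reasons */
  }

  int main(void)
  {
      printf("first read:  %#lx\n", dr6_fetch());  /* 0x4000: DR_STEP */
      printf("second read: %#lx\n", dr6_fetch());  /* 0: nothing stale */
      return 0;
  }
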
/*
* Our handling of the processor debug registers is non-trivial.
* We do not clear them on entry and exit from the kernel. Therefore
@@ -646,86 +787,54 @@
*
* May run on IST stack.
*/
-dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
+static void handle_debug(struct pt_regs *regs, unsigned long dr6, bool user)
{
struct task_struct *tsk = current;
- int user_icebp = 0;
- unsigned long dr6;
+ bool user_icebp;
int si_code;
- nmi_enter();
-
- get_debugreg(dr6, 6);
- /*
- * The Intel SDM says:
- *
- * Certain debug exceptions may clear bits 0-3. The remaining
- * contents of the DR6 register are never cleared by the
- * processor. To avoid confusion in identifying debug
- * exceptions, debug handlers should clear the register before
- * returning to the interrupted task.
- *
- * Keep it simple: clear DR6 immediately.
- */
- set_debugreg(0, 6);
-
- /* Filter out all the reserved bits which are preset to 1 */
- dr6 &= ~DR6_RESERVED;
-
/*
* The SDM says "The processor clears the BTF flag when it
* generates a debug exception." Clear TIF_BLOCKSTEP to keep
* TIF_BLOCKSTEP in sync with the hardware BTF flag.
*/
- clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
+ clear_thread_flag(TIF_BLOCKSTEP);
- if (unlikely(!user_mode(regs) && (dr6 & DR_STEP) &&
- is_sysenter_singlestep(regs))) {
- dr6 &= ~DR_STEP;
- if (!dr6)
- goto exit;
- /*
- * else we might have gotten a single-step trap and hit a
- * watchpoint at the same time, in which case we should fall
- * through and handle the watchpoint.
- */
- }
+ /*
+ * If DR6 is zero, no point in trying to handle it. The kernel is
+ * not using INT1.
+ */
+ if (!user && !dr6)
+ return;
/*
* If dr6 reports no reason for the origin of this trap, then it's very
* likely the result of an icebp/int01 trap.
* User wants a sigtrap for that.
*/
- if (!dr6 && user_mode(regs))
- user_icebp = 1;
+ user_icebp = user && !dr6;
/* Store the virtualized DR6 value */
tsk->thread.debugreg6 = dr6;
#ifdef CONFIG_KPROBES
- if (kprobe_debug_handler(regs))
- goto exit;
+ if (kprobe_debug_handler(regs)) {
+ return;
+ }
#endif
- if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code,
- SIGTRAP) == NOTIFY_STOP)
- goto exit;
-
- /*
- * Let others (NMI) know that the debug stack is in use
- * as we may switch to the interrupt stack.
- */
- debug_stack_usage_inc();
+ if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, 0,
+ SIGTRAP) == NOTIFY_STOP) {
+ return;
+ }
/* It's safe to allow irq's after DR6 has been saved */
cond_local_irq_enable(regs);
if (v8086_mode(regs)) {
- handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
- X86_TRAP_DB);
- cond_local_irq_disable(regs);
- debug_stack_usage_dec();
- goto exit;
+ handle_vm86_trap((struct kernel_vm86_regs *) regs, 0,
+ X86_TRAP_DB);
+ goto out;
}
if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) {
@@ -739,23 +848,91 @@
set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
regs->flags &= ~X86_EFLAGS_TF;
}
+
si_code = get_si_code(tsk->thread.debugreg6);
if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
- send_sigtrap(regs, error_code, si_code);
- cond_local_irq_disable(regs);
- debug_stack_usage_dec();
+ send_sigtrap(regs, 0, si_code);
-exit:
+out:
+ cond_local_irq_disable(regs);
+}
+
+static __always_inline void exc_debug_kernel(struct pt_regs *regs,
+ unsigned long dr6)
+{
+ nmi_enter();
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+
+ /*
+ * Catch SYSENTER with TF set and clear DR_STEP. If this hit a
+ * watchpoint at the same time then that will still be handled.
+ */
+ if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs))
+ dr6 &= ~DR_STEP;
+
+ handle_debug(regs, dr6, false);
+
+ if (regs->flags & X86_EFLAGS_IF)
+ trace_hardirqs_on_prepare();
+ instrumentation_end();
nmi_exit();
}
-NOKPROBE_SYMBOL(do_debug);
+
+static __always_inline void exc_debug_user(struct pt_regs *regs,
+ unsigned long dr6)
+{
+ idtentry_enter_user(regs);
+ instrumentation_begin();
+
+ handle_debug(regs, dr6, true);
+ instrumentation_end();
+ idtentry_exit_user(regs);
+}
+
+#ifdef CONFIG_X86_64
+/* IST stack entry */
+DEFINE_IDTENTRY_DEBUG(exc_debug)
+{
+ unsigned long dr6, dr7;
+
+ debug_enter(&dr6, &dr7);
+ exc_debug_kernel(regs, dr6);
+ debug_exit(dr7);
+}
+
+/* User entry, runs on regular task stack */
+DEFINE_IDTENTRY_DEBUG_USER(exc_debug)
+{
+ unsigned long dr6, dr7;
+
+ debug_enter(&dr6, &dr7);
+ exc_debug_user(regs, dr6);
+ debug_exit(dr7);
+}
+#else
+/* 32 bit does not have separate entry points. */
+DEFINE_IDTENTRY_DEBUG(exc_debug)
+{
+ unsigned long dr6, dr7;
+
+ debug_enter(&dr6, &dr7);
+
+ if (user_mode(regs))
+ exc_debug_user(regs, dr6);
+ else
+ exc_debug_kernel(regs, dr6);
+
+ debug_exit(dr7);
+}
+#endif
/*
* Note that we play around with the 'TS' bit in an attempt to get
* the correct behaviour even in the presence of the asynchronous
* IRQ13 behaviour
*/
-static void math_error(struct pt_regs *regs, int error_code, int trapnr)
+static void math_error(struct pt_regs *regs, int trapnr)
{
struct task_struct *task = current;
struct fpu *fpu = &task->thread.fpu;
@@ -766,16 +943,16 @@
cond_local_irq_enable(regs);
if (!user_mode(regs)) {
- if (fixup_exception(regs, trapnr, error_code, 0))
- return;
+ if (fixup_exception(regs, trapnr, 0, 0))
+ goto exit;
- task->thread.error_code = error_code;
+ task->thread.error_code = 0;
task->thread.trap_nr = trapnr;
- if (notify_die(DIE_TRAP, str, regs, error_code,
- trapnr, SIGFPE) != NOTIFY_STOP)
- die(str, regs, error_code);
- return;
+ if (notify_die(DIE_TRAP, str, regs, 0, trapnr,
+ SIGFPE) != NOTIFY_STOP)
+ die(str, regs, 0);
+ goto exit;
}
/*
@@ -784,32 +961,37 @@
fpu__save(fpu);
task->thread.trap_nr = trapnr;
- task->thread.error_code = error_code;
+ task->thread.error_code = 0;
si_code = fpu__exception_code(fpu, trapnr);
/* Retry when we get spurious exceptions: */
if (!si_code)
- return;
+ goto exit;
force_sig_fault(SIGFPE, si_code,
(void __user *)uprobe_get_trap_addr(regs));
+exit:
+ cond_local_irq_disable(regs);
}
-dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
+DEFINE_IDTENTRY(exc_coprocessor_error)
{
- RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
- math_error(regs, error_code, X86_TRAP_MF);
+ math_error(regs, X86_TRAP_MF);
}
-dotraplinkage void
-do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
+DEFINE_IDTENTRY(exc_simd_coprocessor_error)
{
- RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
- math_error(regs, error_code, X86_TRAP_XF);
+ if (IS_ENABLED(CONFIG_X86_INVD_BUG)) {
+ /* AMD 486 bug: INVD in CPL 0 raises #XF instead of #GP */
+ if (!static_cpu_has(X86_FEATURE_XMM)) {
+ __exc_general_protection(regs, 0);
+ return;
+ }
+ }
+ math_error(regs, X86_TRAP_XF);
}
-dotraplinkage void
-do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
+DEFINE_IDTENTRY(exc_spurious_interrupt_bug)
{
/*
* This addresses a Pentium Pro Erratum:
@@ -832,13 +1014,10 @@
*/
}
-dotraplinkage void
-do_device_not_available(struct pt_regs *regs, long error_code)
+DEFINE_IDTENTRY(exc_device_not_available)
{
unsigned long cr0 = read_cr0();
- RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
-
#ifdef CONFIG_MATH_EMULATION
if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) {
struct math_emu_info info = { };
@@ -847,6 +1026,8 @@
info.regs = regs;
math_emulate(&info);
+
+ cond_local_irq_disable(regs);
return;
}
#endif
@@ -861,22 +1042,20 @@
* to kill the task than getting stuck in a never-ending
* loop of #NM faults.
*/
- die("unexpected #NM exception", regs, error_code);
+ die("unexpected #NM exception", regs, 0);
}
}
-NOKPROBE_SYMBOL(do_device_not_available);
#ifdef CONFIG_X86_32
-dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
+DEFINE_IDTENTRY_SW(iret_error)
{
- RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
local_irq_enable();
-
- if (notify_die(DIE_TRAP, "iret exception", regs, error_code,
+ if (notify_die(DIE_TRAP, "iret exception", regs, 0,
X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) {
- do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
+ do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, 0,
ILL_BADSTK, (void __user *)NULL);
}
+ local_irq_disable();
}
#endif
@@ -888,20 +1067,9 @@
idt_setup_traps();
/*
- * Set the IDT descriptor to a fixed read-only location, so that the
- * "sidt" instruction will not leak the location of the kernel, and
- * to defend the IDT against arbitrary memory write vulnerabilities.
- * It will be reloaded in cpu_init() */
- cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table),
- PAGE_KERNEL_RO);
- idt_descr.address = CPU_ENTRY_AREA_RO_IDT;
-
- /*
* Should be a barrier for any external CPU state:
*/
cpu_init();
idt_setup_ist_traps();
-
- idt_setup_debugidt_traps();
}
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index 54226110..722a85f 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -74,13 +74,7 @@
{
char *addr = (char *)ip;
- if (addr >= __entry_text_start && addr < __entry_text_end)
- return true;
-
- if (addr >= __irqentry_text_start && addr < __irqentry_text_end)
- return true;
-
- return false;
+ return addr >= __entry_text_start && addr < __entry_text_end;
}
static inline unsigned long *last_frame(struct unwind_state *state)
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 7c35556..3bfc8dd 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -134,7 +134,6 @@
KPROBES_TEXT
ALIGN_ENTRY_TEXT_BEGIN
ENTRY_TEXT
- IRQENTRY_TEXT
ALIGN_ENTRY_TEXT_END
SOFTIRQENTRY_TEXT
*(.fixup)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index c8f5e87..8ccfa41 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1839,7 +1839,7 @@
.flags = X86_EFLAGS_IF,
};
- do_machine_check(&regs, 0);
+ do_machine_check(&regs);
#endif
}
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index adb11b5..d1af20b 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3087,9 +3087,9 @@
/*
* VMExit clears RFLAGS.IF and DR7, even on a consistency check.
*/
- local_irq_enable();
if (hw_breakpoint_active())
set_debugreg(__this_cpu_read(cpu_dr7), 7);
+ local_irq_enable();
preempt_enable();
/*
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 08e26a9..36c7717 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4709,7 +4709,7 @@
.flags = X86_EFLAGS_IF,
};
- do_machine_check(&regs, 0);
+ do_machine_check(&regs);
#endif
}
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index 6f8b48f..770b613 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -107,7 +107,6 @@
*/
cea_map_stack(DF);
cea_map_stack(NMI);
- cea_map_stack(DB1);
cea_map_stack(DB);
cea_map_stack(MCE);
}
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index b991aa4..1d6cb07 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -204,8 +204,19 @@
if (fixup_exception(regs, trapnr, regs->orig_ax, 0))
return;
- if (fixup_bug(regs, trapnr))
- return;
+ if (trapnr == X86_TRAP_UD) {
+ if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) {
+ /* Skip the ud2. */
+ regs->ip += LEN_UD2;
+ return;
+ }
+
+ /*
+ * If this was a BUG and report_bug returns or if this
+ * was just a normal #UD, we want to continue onward and
+ * crash.
+ */
+ }
fail:
early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 0x%lx\n",
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 0b03ae8..66be9bd 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -414,21 +414,13 @@
return 0;
}
+/* Pentium F0 0F C7 C8 bug workaround: */
static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
- unsigned long nr;
-
- /*
- * Pentium F0 0F C7 C8 bug workaround:
- */
- if (boot_cpu_has_bug(X86_BUG_F00F)) {
- nr = (address - idt_descr.address) >> 3;
-
- if (nr == 6) {
- do_invalid_op(regs, 0);
- return 1;
- }
+ if (boot_cpu_has_bug(X86_BUG_F00F) && idt_is_f00f_address(address)) {
+ handle_invalid_op(regs);
+ return 1;
}
#endif
return 0;
@@ -786,6 +778,8 @@
force_sig_fault(SIGSEGV, si_code, (void __user *)address);
+ local_irq_disable();
+
return;
}
@@ -1355,11 +1349,38 @@
trace_page_fault_kernel(address, regs, error_code);
}
-dotraplinkage void
-do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
- unsigned long address)
+static __always_inline void
+handle_page_fault(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
{
+ trace_page_fault_entries(regs, error_code, address);
+
+ if (unlikely(kmmio_fault(regs, address)))
+ return;
+
+ /* Was the fault on kernel-controlled part of the address space? */
+ if (unlikely(fault_in_kernel_space(address))) {
+ do_kern_addr_fault(regs, error_code, address);
+ } else {
+ do_user_addr_fault(regs, error_code, address);
+ /*
+ * User address page fault handling might have reenabled
+ * interrupts. Fixing up all potential exit points of
+ * do_user_addr_fault() and its leaf functions is just not
+ * doable w/o creating an unholy mess or turning the code
+ * upside down.
+ */
+ local_irq_disable();
+ }
+}
+
+DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
+{
+ unsigned long address = read_cr2();
+ bool rcu_exit;
+
prefetchw(&current->mm->mmap_lock);
+
/*
* KVM has two types of events that are, logically, interrupts, but
* are unfortunately delivered using the #PF vector. These events are
@@ -1374,19 +1395,28 @@
* getting values from real and async page faults mixed up.
*
* Fingers crossed.
+ *
+ * The async #PF handling code takes care of idtentry handling
+ * itself.
*/
if (kvm_handle_async_pf(regs, (u32)address))
return;
- trace_page_fault_entries(regs, hw_error_code, address);
+ /*
+ * Entry handling for valid #PF from kernel mode is slightly
+ * different: RCU is already watching and rcu_irq_enter() must not
+ * be invoked because a kernel fault on a user space address might
+ * sleep.
+ *
+ * In case the fault hit an RCU idle region, the conditional entry
+ * code reenables RCU to avoid subsequent wreckage, which helps
+ * debuggability.
+ */
+ rcu_exit = idtentry_enter_cond_rcu(regs);
- if (unlikely(kmmio_fault(regs, address)))
- return;
+ instrumentation_begin();
+ handle_page_fault(regs, error_code, address);
+ instrumentation_end();
- /* Was the fault on kernel-controlled part of the address space? */
- if (unlikely(fault_in_kernel_space(address)))
- do_kern_addr_fault(regs, hw_error_code, address);
- else
- do_user_addr_fault(regs, hw_error_code, address);
+ idtentry_exit_cond_rcu(regs, rcu_exit);
}
-NOKPROBE_SYMBOL(do_page_fault);
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index a3c6757..a8a924b 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -492,12 +492,12 @@
}
/*
- * Clone the populated PMDs of the entry and irqentry text and force it RO.
+ * Clone the populated PMDs of the entry text and force it RO.
*/
static void pti_clone_entry_text(void)
{
pti_clone_pgtable((unsigned long) __entry_text_start,
- (unsigned long) __irqentry_text_end,
+ (unsigned long) __entry_text_end,
PTI_CLONE_PMD);
}
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 4ea6969..0ac96ca3 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1272,7 +1272,7 @@
* (the resource will not be freed until noninterruptible cpus see this
* interrupt; hardware may time out the s/w ack and reply ERROR)
*/
-void uv_bau_message_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_uv_bau_message)
{
int count = 0;
cycles_t time_start;
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index e138f7d..3e89b00 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -13,6 +13,7 @@
#include <asm/smp.h>
#include <asm/reboot.h>
#include <asm/setup.h>
+#include <asm/idtentry.h>
#include <asm/hypervisor.h>
#include <asm/e820/api.h>
#include <asm/early_ioremap.h>
@@ -118,6 +119,17 @@
this_cpu_write(xen_vcpu_id, smp_processor_id());
}
+DEFINE_IDTENTRY_SYSVEC(sysvec_xen_hvm_callback)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ inc_irq_stat(irq_hv_callback_count);
+
+ xen_hvm_evtchn_do_upcall();
+
+ set_irq_regs(old_regs);
+}
+
#ifdef CONFIG_KEXEC_CORE
static void xen_hvm_shutdown(void)
{
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index c2c97fa..33b309d 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -604,32 +604,42 @@
bool ist_okay;
};
+#define TRAP_ENTRY(func, ist_ok) { \
+ .orig = asm_##func, \
+ .xen = xen_asm_##func, \
+ .ist_okay = ist_ok }
+
+#define TRAP_ENTRY_REDIR(func, xenfunc, ist_ok) { \
+ .orig = asm_##func, \
+ .xen = xen_asm_##xenfunc, \
+ .ist_okay = ist_ok }
+
static struct trap_array_entry trap_array[] = {
- { debug, xen_xendebug, true },
- { double_fault, xen_double_fault, true },
+ TRAP_ENTRY_REDIR(exc_debug, exc_xendebug, true ),
+ TRAP_ENTRY(exc_double_fault, true ),
#ifdef CONFIG_X86_MCE
- { machine_check, xen_machine_check, true },
+ TRAP_ENTRY(exc_machine_check, true ),
#endif
- { nmi, xen_xennmi, true },
- { int3, xen_int3, false },
- { overflow, xen_overflow, false },
+ TRAP_ENTRY_REDIR(exc_nmi, exc_xennmi, true ),
+ TRAP_ENTRY(exc_int3, false ),
+ TRAP_ENTRY(exc_overflow, false ),
#ifdef CONFIG_IA32_EMULATION
{ entry_INT80_compat, xen_entry_INT80_compat, false },
#endif
- { page_fault, xen_page_fault, false },
- { divide_error, xen_divide_error, false },
- { bounds, xen_bounds, false },
- { invalid_op, xen_invalid_op, false },
- { device_not_available, xen_device_not_available, false },
- { coprocessor_segment_overrun, xen_coprocessor_segment_overrun, false },
- { invalid_TSS, xen_invalid_TSS, false },
- { segment_not_present, xen_segment_not_present, false },
- { stack_segment, xen_stack_segment, false },
- { general_protection, xen_general_protection, false },
- { spurious_interrupt_bug, xen_spurious_interrupt_bug, false },
- { coprocessor_error, xen_coprocessor_error, false },
- { alignment_check, xen_alignment_check, false },
- { simd_coprocessor_error, xen_simd_coprocessor_error, false },
+ TRAP_ENTRY(exc_page_fault, false ),
+ TRAP_ENTRY(exc_divide_error, false ),
+ TRAP_ENTRY(exc_bounds, false ),
+ TRAP_ENTRY(exc_invalid_op, false ),
+ TRAP_ENTRY(exc_device_not_available, false ),
+ TRAP_ENTRY(exc_coproc_segment_overrun, false ),
+ TRAP_ENTRY(exc_invalid_tss, false ),
+ TRAP_ENTRY(exc_segment_not_present, false ),
+ TRAP_ENTRY(exc_stack_segment, false ),
+ TRAP_ENTRY(exc_general_protection, false ),
+ TRAP_ENTRY(exc_spurious_interrupt_bug, false ),
+ TRAP_ENTRY(exc_coprocessor_error, false ),
+ TRAP_ENTRY(exc_alignment_check, false ),
+ TRAP_ENTRY(exc_simd_coprocessor_error, false ),
};
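
TRAP_ENTRY()/TRAP_ENTRY_REDIR() derive both the asm_* and xen_asm_* symbol
names from a single token, so the two columns of the table cannot drift
apart. A compilable miniature of the idea -- strings stand in for the real
function addresses here, so stringification is used where the kernel pastes
identifiers::

  #include <stdio.h>

  struct trap_array_entry {
      const char *orig;
      const char *xen;
      int ist_okay;
  };

  #define TRAP_ENTRY(func, ist_ok) \
      { .orig = "asm_" #func, .xen = "xen_asm_" #func, .ist_okay = ist_ok }

  static struct trap_array_entry trap_array[] = {
      TRAP_ENTRY(exc_int3,         0),
      TRAP_ENTRY(exc_overflow,     0),
      TRAP_ENTRY(exc_double_fault, 1),
  };

  int main(void)
  {
      for (unsigned int i = 0; i < sizeof(trap_array)/sizeof(trap_array[0]); i++)
          printf("%-20s -> %-24s ist_okay=%d\n", trap_array[i].orig,
                 trap_array[i].xen, trap_array[i].ist_okay);
      return 0;
  }
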
static bool __ref get_trap_addr(void **addr, unsigned int ist)
@@ -641,7 +651,7 @@
* Replace trap handler addresses by Xen specific ones.
* Check for known traps using IST and whitelist them.
* The debugger ones are the only ones we care about.
- * Xen will handle faults like double_fault, * so we should never see
+ * Xen will handle faults like double_fault, so we should never see
* them. Warn if there's an unexpected IST-using fault handler.
*/
for (nr = 0; nr < ARRAY_SIZE(trap_array); nr++) {
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 1a2d8a5..3566e37 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -20,6 +20,7 @@
#include <asm/setup.h>
#include <asm/acpi.h>
#include <asm/numa.h>
+#include <asm/idtentry.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
@@ -993,7 +994,8 @@
HYPERVISOR_vm_assist(VMASST_CMD_enable,
VMASST_TYPE_pae_extended_cr3);
- if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
+ if (register_callback(CALLBACKTYPE_event,
+ xen_asm_exc_xen_hypervisor_callback) ||
register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
BUG();
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index 8fa01c5..171aff1 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -26,6 +26,7 @@
#include <linux/pgtable.h>
#include <asm/paravirt.h>
+#include <asm/idtentry.h>
#include <asm/desc.h>
#include <asm/cpu.h>
@@ -348,7 +349,7 @@
ctxt->gs_base_kernel = per_cpu_offset(cpu);
#endif
ctxt->event_callback_eip =
- (unsigned long)xen_hypervisor_callback;
+ (unsigned long)xen_asm_exc_xen_hypervisor_callback;
ctxt->failsafe_callback_eip =
(unsigned long)xen_failsafe_callback;
per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
diff --git a/arch/x86/xen/suspend_hvm.c b/arch/x86/xen/suspend_hvm.c
index e666b61..9d548b0 100644
--- a/arch/x86/xen/suspend_hvm.c
+++ b/arch/x86/xen/suspend_hvm.c
@@ -2,6 +2,7 @@
#include <linux/types.h>
#include <xen/xen.h>
+#include <xen/hvm.h>
#include <xen/features.h>
#include <xen/interface/features.h>
@@ -13,6 +14,6 @@
xen_hvm_init_shared_info();
xen_vcpu_restore();
}
- xen_callback_vector();
+ xen_setup_callback_vector();
xen_unplug_emulated_devices();
}
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index 2712e91..4757cec 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -93,7 +93,7 @@
/*
* If there's something pending, mask events again so we can
- * jump back into xen_hypervisor_callback. Otherwise do not
+ * jump back into exc_xen_hypervisor_callback. Otherwise do not
* touch XEN_vcpu_info_mask.
*/
jne 1f
@@ -113,11 +113,11 @@
* Events are masked, so jumping out of the critical region is
* OK.
*/
- je xen_hypervisor_callback
+ je xen_asm_exc_xen_hypervisor_callback
1: iret
xen_iret_end_crit:
- _ASM_EXTABLE(1b, iret_exc)
+ _ASM_EXTABLE(1b, asm_iret_error)
hyper_iret:
/* put this out of line since its very rarely used */
@@ -127,7 +127,7 @@
.globl xen_iret_start_crit, xen_iret_end_crit
/*
- * This is called by xen_hypervisor_callback in entry_32.S when it sees
+ * This is called by xen_asm_exc_xen_hypervisor_callback in entry_32.S when it sees
* that the EIP at the time of interrupt was between
* xen_iret_start_crit and xen_iret_end_crit.
*
@@ -144,7 +144,7 @@
* eflags }
* cs } nested exception info
* eip }
- * return address : (into xen_hypervisor_callback)
+ * return address : (into xen_asm_exc_xen_hypervisor_callback)
*
* In order to deliver the nested exception properly, we need to discard the
* nested exception frame such that when we handle the exception, we do it
@@ -152,7 +152,8 @@
*
* The only caveat is that if the outer eax hasn't been restored yet (i.e.
* it's still on stack), we need to restore its value here.
- */
+ */
+.pushsection .noinstr.text, "ax"
SYM_CODE_START(xen_iret_crit_fixup)
/*
* Paranoia: Make sure we're really coming from kernel space.
@@ -181,3 +182,4 @@
2:
ret
SYM_CODE_END(xen_iret_crit_fixup)
+.popsection
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 0a0fd16..5d252aa 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -28,33 +28,33 @@
_ASM_NOKPROBE(xen_\name)
.endm
-xen_pv_trap divide_error
-xen_pv_trap debug
-xen_pv_trap xendebug
-xen_pv_trap int3
-xen_pv_trap xennmi
-xen_pv_trap overflow
-xen_pv_trap bounds
-xen_pv_trap invalid_op
-xen_pv_trap device_not_available
-xen_pv_trap double_fault
-xen_pv_trap coprocessor_segment_overrun
-xen_pv_trap invalid_TSS
-xen_pv_trap segment_not_present
-xen_pv_trap stack_segment
-xen_pv_trap general_protection
-xen_pv_trap page_fault
-xen_pv_trap spurious_interrupt_bug
-xen_pv_trap coprocessor_error
-xen_pv_trap alignment_check
+xen_pv_trap asm_exc_divide_error
+xen_pv_trap asm_exc_debug
+xen_pv_trap asm_exc_xendebug
+xen_pv_trap asm_exc_int3
+xen_pv_trap asm_exc_xennmi
+xen_pv_trap asm_exc_overflow
+xen_pv_trap asm_exc_bounds
+xen_pv_trap asm_exc_invalid_op
+xen_pv_trap asm_exc_device_not_available
+xen_pv_trap asm_exc_double_fault
+xen_pv_trap asm_exc_coproc_segment_overrun
+xen_pv_trap asm_exc_invalid_tss
+xen_pv_trap asm_exc_segment_not_present
+xen_pv_trap asm_exc_stack_segment
+xen_pv_trap asm_exc_general_protection
+xen_pv_trap asm_exc_page_fault
+xen_pv_trap asm_exc_spurious_interrupt_bug
+xen_pv_trap asm_exc_coprocessor_error
+xen_pv_trap asm_exc_alignment_check
#ifdef CONFIG_X86_MCE
-xen_pv_trap machine_check
+xen_pv_trap asm_exc_machine_check
#endif /* CONFIG_X86_MCE */
-xen_pv_trap simd_coprocessor_error
+xen_pv_trap asm_exc_simd_coprocessor_error
#ifdef CONFIG_IA32_EMULATION
xen_pv_trap entry_INT80_compat
#endif
-xen_pv_trap hypervisor_callback
+xen_pv_trap asm_exc_xen_hypervisor_callback
__INIT
SYM_CODE_START(xen_early_idt_handler_array)
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 45a441c..53b224f 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -8,7 +8,6 @@
#include <xen/xen-ops.h>
/* These are code, but not functions. Defined in entry.S */
-extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
void xen_sysenter_target(void);
@@ -55,7 +54,6 @@
void xen_enable_syscall(void);
void xen_vcpu_restore(void);
-void xen_callback_vector(void);
void xen_hvm_init_shared_info(void);
void xen_unplug_emulated_devices(void);
diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c
index 8596a10..f138e12 100644
--- a/drivers/acpi/acpi_extlog.c
+++ b/drivers/acpi/acpi_extlog.c
@@ -42,8 +42,6 @@
u8 rev1[12];
};
-static int old_edac_report_status;
-
static u8 extlog_dsm_uuid[] __initdata = "663E35AF-CC10-41A4-88EA-5470AF055295";
/* L1 table related physical address */
@@ -146,7 +144,7 @@
static u32 err_seq;
estatus = extlog_elog_entry_check(cpu, bank);
- if (estatus == NULL)
+ if (estatus == NULL || (mce->kflags & MCE_HANDLED_CEC))
return NOTIFY_DONE;
memcpy(elog_buf, (void *)estatus, ELOG_ENTRY_LEN);
@@ -176,7 +174,8 @@
}
out:
- return NOTIFY_STOP;
+ mce->kflags |= MCE_HANDLED_EXTLOG;
+ return NOTIFY_OK;
}
static bool __init extlog_get_l1addr(void)
@@ -228,11 +227,6 @@
if (!(cap & MCG_ELOG_P) || !extlog_get_l1addr())
return -ENODEV;
- if (edac_get_report_status() == EDAC_REPORTING_FORCE) {
- pr_warn("Not loading eMCA, error reporting force-enabled through EDAC.\n");
- return -EPERM;
- }
-
rc = -EINVAL;
/* get L1 header to fetch necessary information */
l1_hdr_size = sizeof(struct extlog_l1_head);
@@ -280,12 +274,6 @@
if (elog_buf == NULL)
goto err_release_elog;
- /*
- * eMCA event report method has higher priority than EDAC method,
- * unless EDAC event report method is mandatory.
- */
- old_edac_report_status = edac_get_report_status();
- edac_set_report_status(EDAC_REPORTING_DISABLED);
mce_register_decode_chain(&extlog_mce_dec);
/* enable OS to be involved to take over management from BIOS */
((struct extlog_l1_head *)extlog_l1_addr)->flags |= FLAG_OS_OPTIN;
@@ -307,7 +295,6 @@
static void __exit extlog_exit(void)
{
- edac_set_report_status(old_edac_report_status);
mce_unregister_decode_chain(&extlog_mce_dec);
((struct extlog_l1_head *)extlog_l1_addr)->flags &= ~FLAG_OS_OPTIN;
if (extlog_l1_addr)
diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c
index f0ae485..ee8d997 100644
--- a/drivers/acpi/nfit/mce.c
+++ b/drivers/acpi/nfit/mce.c
@@ -76,6 +76,7 @@
*/
acpi_nfit_ars_rescan(acpi_desc, 0);
}
+ mce->kflags |= MCE_HANDLED_NFIT;
break;
}
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 9cf7cc1..ef90070 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -4,9 +4,6 @@
static struct edac_pci_ctl_info *pci_ctl;
-static int report_gart_errors;
-module_param(report_gart_errors, int, 0644);
-
/*
* Set by command line parameter. If BIOS has enabled the ECC, this override is
* cleared to prevent re-enabling the hardware by this driver.
@@ -2319,6 +2316,16 @@
.dbam_to_cs = f17_addr_mask_to_cs_size,
}
},
+ [F17_M60H_CPUS] = {
+ .ctl_name = "F17h_M60h",
+ .f0_id = PCI_DEVICE_ID_AMD_17H_M60H_DF_F0,
+ .f6_id = PCI_DEVICE_ID_AMD_17H_M60H_DF_F6,
+ .max_mcs = 2,
+ .ops = {
+ .early_channel_count = f17_early_channel_count,
+ .dbam_to_cs = f17_addr_mask_to_cs_size,
+ }
+ },
[F17_M70H_CPUS] = {
.ctl_name = "F17h_M70h",
.f0_id = PCI_DEVICE_ID_AMD_17H_M70H_DF_F0,
@@ -3357,6 +3364,10 @@
fam_type = &family_types[F17_M30H_CPUS];
pvt->ops = &family_types[F17_M30H_CPUS].ops;
break;
+ } else if (pvt->model >= 0x60 && pvt->model <= 0x6f) {
+ fam_type = &family_types[F17_M60H_CPUS];
+ pvt->ops = &family_types[F17_M60H_CPUS].ops;
+ break;
} else if (pvt->model >= 0x70 && pvt->model <= 0x7f) {
fam_type = &family_types[F17_M70H_CPUS];
pvt->ops = &family_types[F17_M70H_CPUS].ops;
@@ -3681,9 +3692,6 @@
}
/* register stuff with EDAC MCE */
- if (report_gart_errors)
- amd_report_gart_errors(true);
-
if (boot_cpu_data.x86 >= 0x17)
amd_register_ecc_decoder(decode_umc_error);
else
@@ -3718,8 +3726,6 @@
edac_pci_release_generic_ctl(pci_ctl);
/* unregister from EDAC MCE */
- amd_report_gart_errors(false);
-
if (boot_cpu_data.x86 >= 0x17)
amd_unregister_ecc_decoder(decode_umc_error);
else
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index abbf3c2..52b5d03 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -120,6 +120,8 @@
#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F6 0x15ee
#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F0 0x1490
#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F6 0x1496
+#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F0 0x1448
+#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F6 0x144e
#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F0 0x1440
#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F6 0x1446
#define PCI_DEVICE_ID_AMD_19H_DF_F0 0x1650
@@ -293,6 +295,7 @@
F17_CPUS,
F17_M10H_CPUS,
F17_M30H_CPUS,
+ F17_M60H_CPUS,
F17_M70H_CPUS,
F19_CPUS,
NUM_FAMILIES,
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 75ede27..5813e93 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -43,8 +43,6 @@
int edac_op_state = EDAC_OPSTATE_INVAL;
EXPORT_SYMBOL_GPL(edac_op_state);
-static int edac_report = EDAC_REPORTING_ENABLED;
-
/* lock to memory controller's control array */
static DEFINE_MUTEX(mem_ctls_mutex);
static LIST_HEAD(mc_devices);
@@ -60,65 +58,6 @@
return container_of(e, struct mem_ctl_info, error_desc);
}
-int edac_get_report_status(void)
-{
- return edac_report;
-}
-EXPORT_SYMBOL_GPL(edac_get_report_status);
-
-void edac_set_report_status(int new)
-{
- if (new == EDAC_REPORTING_ENABLED ||
- new == EDAC_REPORTING_DISABLED ||
- new == EDAC_REPORTING_FORCE)
- edac_report = new;
-}
-EXPORT_SYMBOL_GPL(edac_set_report_status);
-
-static int edac_report_set(const char *str, const struct kernel_param *kp)
-{
- if (!str)
- return -EINVAL;
-
- if (!strncmp(str, "on", 2))
- edac_report = EDAC_REPORTING_ENABLED;
- else if (!strncmp(str, "off", 3))
- edac_report = EDAC_REPORTING_DISABLED;
- else if (!strncmp(str, "force", 5))
- edac_report = EDAC_REPORTING_FORCE;
-
- return 0;
-}
-
-static int edac_report_get(char *buffer, const struct kernel_param *kp)
-{
- int ret = 0;
-
- switch (edac_report) {
- case EDAC_REPORTING_ENABLED:
- ret = sprintf(buffer, "on");
- break;
- case EDAC_REPORTING_DISABLED:
- ret = sprintf(buffer, "off");
- break;
- case EDAC_REPORTING_FORCE:
- ret = sprintf(buffer, "force");
- break;
- default:
- ret = -EINVAL;
- break;
- }
-
- return ret;
-}
-
-static const struct kernel_param_ops edac_report_ops = {
- .set = edac_report_set,
- .get = edac_report_get,
-};
-
-module_param_cb(edac_report, &edac_report_ops, &edac_report, 0644);
-
unsigned int edac_dimm_info_location(struct dimm_info *dimm, char *buf,
unsigned int len)
{
diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c
index b3135b2..5860ca4 100644
--- a/drivers/edac/i7core_edac.c
+++ b/drivers/edac/i7core_edac.c
@@ -1815,7 +1815,7 @@
struct mem_ctl_info *mci;
i7_dev = get_i7core_dev(mce->socketid);
- if (!i7_dev)
+ if (!i7_dev || (mce->kflags & MCE_HANDLED_CEC))
return NOTIFY_DONE;
mci = i7_dev->mci;
@@ -1834,7 +1834,8 @@
i7core_check_error(mci, mce);
/* Advise mcelog that the errors were handled */
- return NOTIFY_STOP;
+ mce->kflags |= MCE_HANDLED_EDAC;
+ return NOTIFY_OK;
}
static struct notifier_block i7_mce_dec = {
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index 8874b77..2b5401d 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -10,15 +10,8 @@
static u8 xec_mask = 0xf;
-static bool report_gart_errors;
static void (*decode_dram_ecc)(int node_id, struct mce *m);
-void amd_report_gart_errors(bool v)
-{
- report_gart_errors = v;
-}
-EXPORT_SYMBOL_GPL(amd_report_gart_errors);
-
void amd_register_ecc_decoder(void (*f)(int, struct mce *))
{
decode_dram_ecc = f;
@@ -1030,20 +1023,6 @@
pr_cont("\n");
}
-/*
- * Filter out unwanted MCE signatures here.
- */
-static bool ignore_mce(struct mce *m)
-{
- /*
- * NB GART TLB error reporting is disabled by default.
- */
- if (m->bank == 4 && XEC(m->status, 0x1f) == 0x5 && !report_gart_errors)
- return true;
-
- return false;
-}
-
static const char *decode_error_status(struct mce *m)
{
if (m->status & MCI_STATUS_UC) {
@@ -1067,8 +1046,8 @@
unsigned int fam = x86_family(m->cpuid);
int ecc;
- if (ignore_mce(m))
- return NOTIFY_STOP;
+ if (m->kflags & MCE_HANDLED_CEC)
+ return NOTIFY_DONE;
pr_emerg(HW_ERR "%s\n", decode_error_status(m));
@@ -1170,7 +1149,8 @@
err_code:
amd_decode_err_code(m->status & 0xffff);
- return NOTIFY_STOP;
+ m->kflags |= MCE_HANDLED_EDAC;
+ return NOTIFY_OK;
}
static struct notifier_block amd_mce_dec_nb = {
diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h
index 4e9c5e5..4811b18 100644
--- a/drivers/edac/mce_amd.h
+++ b/drivers/edac/mce_amd.h
@@ -7,7 +7,6 @@
#include <asm/mce.h>
#define EC(x) ((x) & 0xffff)
-#define XEC(x, mask) (((x) >> 16) & mask)
#define LOW_SYNDROME(x) (((x) >> 15) & 0xff)
#define HIGH_SYNDROME(x) (((x) >> 24) & 0xff)
@@ -77,7 +76,6 @@
bool (*mc2_mce)(u16, u8);
};
-void amd_report_gart_errors(bool);
void amd_register_ecc_decoder(void (*f)(int, struct mce *));
void amd_unregister_ecc_decoder(void (*f)(int, struct mce *));
diff --git a/drivers/edac/pnd2_edac.c b/drivers/edac/pnd2_edac.c
index bc47328..c1f2e6d 100644
--- a/drivers/edac/pnd2_edac.c
+++ b/drivers/edac/pnd2_edac.c
@@ -1396,11 +1396,8 @@
struct dram_addr daddr;
char *type;
- if (edac_get_report_status() == EDAC_REPORTING_DISABLED)
- return NOTIFY_DONE;
-
mci = pnd2_mci;
- if (!mci)
+ if (!mci || (mce->kflags & MCE_HANDLED_CEC))
return NOTIFY_DONE;
/*
@@ -1429,7 +1426,8 @@
pnd2_mce_output_error(mci, mce, &daddr);
/* Advise mcelog that the errors were handled */
- return NOTIFY_STOP;
+ mce->kflags |= MCE_HANDLED_EDAC;
+ return NOTIFY_OK;
}
static struct notifier_block pnd2_mce_dec = {
diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
index 7d51c82..d414698 100644
--- a/drivers/edac/sb_edac.c
+++ b/drivers/edac/sb_edac.c
@@ -3134,7 +3134,7 @@
struct mem_ctl_info *mci;
char *type;
- if (edac_get_report_status() == EDAC_REPORTING_DISABLED)
+ if (mce->kflags & MCE_HANDLED_CEC)
return NOTIFY_DONE;
/*
@@ -3183,7 +3183,8 @@
sbridge_mce_output_error(mci, mce);
/* Advise mcelog that the errors were handled */
- return NOTIFY_STOP;
+ mce->kflags |= MCE_HANDLED_EDAC;
+ return NOTIFY_OK;
}
static struct notifier_block sbridge_mce_dec = {
@@ -3523,8 +3524,6 @@
if (rc >= 0) {
mce_register_decode_chain(&sbridge_mce_dec);
- if (edac_get_report_status() == EDAC_REPORTING_DISABLED)
- sbridge_printk(KERN_WARNING, "Loading driver, error reporting disabled.\n");
return 0;
}
diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c
index 46be1a7..6d8d6dc 100644
--- a/drivers/edac/skx_common.c
+++ b/drivers/edac/skx_common.c
@@ -573,7 +573,7 @@
struct mem_ctl_info *mci;
char *type;
- if (edac_get_report_status() == EDAC_REPORTING_DISABLED)
+ if (mce->kflags & MCE_HANDLED_CEC)
return NOTIFY_DONE;
/* ignore unless this is memory related with an address */
@@ -615,6 +615,7 @@
skx_mce_output_error(mci, mce, &res);
+ mce->kflags |= MCE_HANDLED_EDAC;
return NOTIFY_DONE;
}
diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c
index 9915578..8f12995 100644
--- a/drivers/hwmon/k10temp.c
+++ b/drivers/hwmon/k10temp.c
@@ -632,6 +632,7 @@
{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) },
{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) },
{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) },
+ { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F3) },
{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) },
{ PCI_VDEVICE(HYGON, PCI_DEVICE_ID_AMD_17H_DF_F3) },
{}
diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index c09cf55..569d9ad 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -309,7 +309,7 @@
return ret;
}
-int cec_add_elem(u64 pfn)
+static int cec_add_elem(u64 pfn)
{
struct ce_array *ca = &ce_arr;
unsigned int to = 0;
@@ -527,7 +527,33 @@
return 1;
}
-void __init cec_init(void)
+static int cec_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce *m = (struct mce *)data;
+
+ if (!m)
+ return NOTIFY_DONE;
+
+ /* We eat only correctable DRAM errors with usable addresses. */
+ if (mce_is_memory_error(m) &&
+ mce_is_correctable(m) &&
+ mce_usable_address(m)) {
+ if (!cec_add_elem(m->addr >> PAGE_SHIFT)) {
+ m->kflags |= MCE_HANDLED_CEC;
+ return NOTIFY_OK;
+ }
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block cec_nb = {
+ .notifier_call = cec_notifier,
+ .priority = MCE_PRIO_CEC,
+};
+
+static void __init cec_init(void)
{
if (ce_arr.disabled)
return;
@@ -546,8 +572,11 @@
INIT_DELAYED_WORK(&cec_work, cec_work_fn);
schedule_delayed_work(&cec_work, CEC_DECAY_DEFAULT_INTERVAL);
+ mce_register_decode_chain(&cec_nb);
+
pr_info("Correctable Errors collector initialized.\n");
}
+late_initcall(cec_init);
int __init parse_cec_param(char *str)
{
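
This hunk shows the shape of the MCE notifier rework running through the
EDAC and extlog changes above: handlers no longer return NOTIFY_STOP to
starve the rest of the chain; they tag the record with an MCE_HANDLED_*
kflag and return NOTIFY_OK, and later subscribers skip records someone
earlier (the CEC here) already claimed. A self-contained model of that
flow, with simplified structs and a hard-wired two-entry chain::

  #include <stdio.h>

  #define NOTIFY_DONE 0
  #define NOTIFY_OK   1

  #define MCE_HANDLED_CEC  (1u << 0)
  #define MCE_HANDLED_EDAC (1u << 1)

  struct mce { unsigned long addr; int correctable; unsigned int kflags; };

  static int cec_notifier_model(struct mce *m)
  {
      if (!m->correctable)
          return NOTIFY_DONE;
      m->kflags |= MCE_HANDLED_CEC;  /* claimed, but the chain keeps going */
      return NOTIFY_OK;
  }

  static int edac_notifier_model(struct mce *m)
  {
      if (m->kflags & MCE_HANDLED_CEC)  /* someone upstream ate it */
          return NOTIFY_DONE;
      printf("EDAC decodes error at %#lx\n", m->addr);
      m->kflags |= MCE_HANDLED_EDAC;
      return NOTIFY_OK;
  }

  static void run_chain(struct mce *m)
  {
      /* Unlike NOTIFY_STOP, every callee gets a look at the record. */
      cec_notifier_model(m);
      edac_notifier_model(m);
      printf("addr %#lx kflags=%#x\n", m->addr, m->kflags);
  }

  int main(void)
  {
      struct mce ce = { .addr = 0x1000, .correctable = 1 };
      struct mce ue = { .addr = 0x2000, .correctable = 0 };

      run_chain(&ce);  /* CEC claims it, EDAC skips */
      run_chain(&ue);  /* CEC passes, EDAC decodes */
      return 0;
  }
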
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 0c4efa6..0d322f3 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
-obj-y += grant-table.o features.o balloon.o manage.o preempt.o time.o
+obj-y += grant-table.o features.o balloon.o manage.o time.o
obj-y += mem-reservation.o
obj-y += events/
obj-y += xenbus/
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index 3a791c8..140c7bf 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -37,6 +37,7 @@
#ifdef CONFIG_X86
#include <asm/desc.h>
#include <asm/ptrace.h>
+#include <asm/idtentry.h>
#include <asm/irq.h>
#include <asm/io_apic.h>
#include <asm/i8259.h>
@@ -1236,9 +1237,6 @@
struct pt_regs *old_regs = set_irq_regs(regs);
irq_enter();
-#ifdef CONFIG_X86
- inc_irq_stat(irq_hv_callback_count);
-#endif
__xen_evtchn_do_upcall();
@@ -1639,26 +1637,30 @@
/* Vector callbacks are better than PCI interrupts to receive event
* channel notifications because we can receive vector callbacks on any
* vcpu and we don't need PCI support or APIC interactions. */
-void xen_callback_vector(void)
+void xen_setup_callback_vector(void)
{
- int rc;
uint64_t callback_via;
if (xen_have_vector_callback) {
callback_via = HVM_CALLBACK_VECTOR(HYPERVISOR_CALLBACK_VECTOR);
- rc = xen_set_callback_via(callback_via);
- if (rc) {
+ if (xen_set_callback_via(callback_via)) {
pr_err("Request for Xen HVM callback vector failed\n");
xen_have_vector_callback = 0;
- return;
}
- pr_info_once("Xen HVM callback vector for event delivery is enabled\n");
- alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR,
- xen_hvm_callback_vector);
}
}
+
+static __init void xen_alloc_callback_vector(void)
+{
+ if (!xen_have_vector_callback)
+ return;
+
+ pr_info("Xen HVM callback vector for event delivery is enabled\n");
+ alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_xen_hvm_callback);
+}
#else
-void xen_callback_vector(void) {}
+void xen_setup_callback_vector(void) {}
+static inline void xen_alloc_callback_vector(void) {}
#endif
#undef MODULE_PARAM_PREFIX
@@ -1692,8 +1694,10 @@
if (xen_initial_domain())
pci_xen_initial_domain();
}
- if (xen_feature(XENFEAT_hvm_callback_vector))
- xen_callback_vector();
+ if (xen_feature(XENFEAT_hvm_callback_vector)) {
+ xen_setup_callback_vector();
+ xen_alloc_callback_vector();
+ }
if (xen_hvm_domain()) {
native_init_IRQ();
diff --git a/drivers/xen/preempt.c b/drivers/xen/preempt.c
deleted file mode 100644
index 17240c5..0000000
--- a/drivers/xen/preempt.c
+++ /dev/null
@@ -1,42 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Preemptible hypercalls
- *
- * Copyright (C) 2014 Citrix Systems R&D ltd.
- */
-
-#include <linux/sched.h>
-#include <xen/xen-ops.h>
-
-#ifndef CONFIG_PREEMPTION
-
-/*
- * Some hypercalls issued by the toolstack can take many 10s of
- * seconds. Allow tasks running hypercalls via the privcmd driver to
- * be voluntarily preempted even if full kernel preemption is
- * disabled.
- *
- * Such preemptible hypercalls are bracketed by
- * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
- * calls.
- */
-
-DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
-EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
-
-asmlinkage __visible void xen_maybe_preempt_hcall(void)
-{
- if (unlikely(__this_cpu_read(xen_in_preemptible_hcall)
- && need_resched())) {
- /*
- * Clear flag as we may be rescheduled on a different
- * cpu.
- */
- __this_cpu_write(xen_in_preemptible_hcall, false);
- local_irq_enable();
- cond_resched();
- local_irq_disable();
- __this_cpu_write(xen_in_preemptible_hcall, true);
- }
-}
-#endif /* CONFIG_PREEMPTION */
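
[Editor's note: the per-CPU flag and the resched check move into x86 entry code in this series; the begin/end bracketing described by the deleted comment is unchanged for callers. A hedged sketch of that pattern, modelled on what drivers/xen/privcmd.c does around slow hypercalls — the wrapper name is invented:

static long my_slow_dm_op(domid_t dom, unsigned int num,
			  struct xen_dm_op_buf *bufs)
{
	long ret;

	xen_preemptible_hcall_begin();
	ret = HYPERVISOR_dm_op(dom, num, bufs);	/* may run for many seconds */
	xen_preemptible_hcall_end();
	return ret;
}
]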
diff --git a/fs/pipe.c b/fs/pipe.c
index c7c4fb5..60dbee4 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -24,6 +24,7 @@
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <linux/memcontrol.h>
+#include <linux/watch_queue.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
@@ -259,14 +260,44 @@
unsigned int tail = pipe->tail;
unsigned int mask = pipe->ring_size - 1;
+#ifdef CONFIG_WATCH_QUEUE
+ if (pipe->note_loss) {
+ struct watch_notification n;
+
+ if (total_len < 8) {
+ if (ret == 0)
+ ret = -ENOBUFS;
+ break;
+ }
+
+ n.type = WATCH_TYPE_META;
+ n.subtype = WATCH_META_LOSS_NOTIFICATION;
+ n.info = watch_sizeof(n);
+ if (copy_to_iter(&n, sizeof(n), to) != sizeof(n)) {
+ if (ret == 0)
+ ret = -EFAULT;
+ break;
+ }
+ ret += sizeof(n);
+ total_len -= sizeof(n);
+ pipe->note_loss = false;
+ }
+#endif
+
if (!pipe_empty(head, tail)) {
struct pipe_buffer *buf = &pipe->bufs[tail & mask];
size_t chars = buf->len;
size_t written;
int error;
- if (chars > total_len)
+ if (chars > total_len) {
+ if (buf->flags & PIPE_BUF_FLAG_WHOLE) {
+ if (ret == 0)
+ ret = -ENOBUFS;
+ break;
+ }
chars = total_len;
+ }
error = pipe_buf_confirm(pipe, buf);
if (error) {
@@ -294,6 +325,10 @@
if (!buf->len) {
pipe_buf_release(pipe, buf);
spin_lock_irq(&pipe->rd_wait.lock);
+#ifdef CONFIG_WATCH_QUEUE
+ if (buf->flags & PIPE_BUF_FLAG_LOSS)
+ pipe->note_loss = true;
+#endif
tail++;
pipe->tail = tail;
spin_unlock_irq(&pipe->rd_wait.lock);
@@ -405,6 +440,13 @@
goto out;
}
+#ifdef CONFIG_WATCH_QUEUE
+ if (pipe->watch_queue) {
+ ret = -EXDEV;
+ goto out;
+ }
+#endif
+
/*
* Only wake up if the pipe started out empty, since
* otherwise there should be no readers waiting.
@@ -574,22 +616,37 @@
int count, head, tail, mask;
switch (cmd) {
- case FIONREAD:
- __pipe_lock(pipe);
- count = 0;
- head = pipe->head;
- tail = pipe->tail;
- mask = pipe->ring_size - 1;
+ case FIONREAD:
+ __pipe_lock(pipe);
+ count = 0;
+ head = pipe->head;
+ tail = pipe->tail;
+ mask = pipe->ring_size - 1;
- while (tail != head) {
- count += pipe->bufs[tail & mask].len;
- tail++;
- }
- __pipe_unlock(pipe);
+ while (tail != head) {
+ count += pipe->bufs[tail & mask].len;
+ tail++;
+ }
+ __pipe_unlock(pipe);
- return put_user(count, (int __user *)arg);
- default:
- return -ENOIOCTLCMD;
+ return put_user(count, (int __user *)arg);
+
+#ifdef CONFIG_WATCH_QUEUE
+ case IOC_WATCH_QUEUE_SET_SIZE: {
+ int ret;
+ __pipe_lock(pipe);
+ ret = watch_queue_set_size(pipe, arg);
+ __pipe_unlock(pipe);
+ return ret;
+ }
+
+ case IOC_WATCH_QUEUE_SET_FILTER:
+ return watch_queue_set_filter(
+ pipe, (struct watch_notification_filter __user *)arg);
+#endif
+
+ default:
+ return -ENOIOCTLCMD;
}
}
@@ -700,27 +757,27 @@
return retval;
}
-static unsigned long account_pipe_buffers(struct user_struct *user,
- unsigned long old, unsigned long new)
+unsigned long account_pipe_buffers(struct user_struct *user,
+ unsigned long old, unsigned long new)
{
return atomic_long_add_return(new - old, &user->pipe_bufs);
}
-static bool too_many_pipe_buffers_soft(unsigned long user_bufs)
+bool too_many_pipe_buffers_soft(unsigned long user_bufs)
{
unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft);
return soft_limit && user_bufs > soft_limit;
}
-static bool too_many_pipe_buffers_hard(unsigned long user_bufs)
+bool too_many_pipe_buffers_hard(unsigned long user_bufs)
{
unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard);
return hard_limit && user_bufs > hard_limit;
}
-static bool is_unprivileged_user(void)
+bool pipe_is_unprivileged_user(void)
{
return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
}
@@ -742,12 +799,12 @@
user_bufs = account_pipe_buffers(user, 0, pipe_bufs);
- if (too_many_pipe_buffers_soft(user_bufs) && is_unprivileged_user()) {
+ if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
user_bufs = account_pipe_buffers(user, pipe_bufs, 1);
pipe_bufs = 1;
}
- if (too_many_pipe_buffers_hard(user_bufs) && is_unprivileged_user())
+ if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
goto out_revert_acct;
pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
@@ -759,6 +816,7 @@
pipe->r_counter = pipe->w_counter = 1;
pipe->max_usage = pipe_bufs;
pipe->ring_size = pipe_bufs;
+ pipe->nr_accounted = pipe_bufs;
pipe->user = user;
mutex_init(&pipe->mutex);
return pipe;
@@ -776,7 +834,14 @@
{
int i;
- (void) account_pipe_buffers(pipe->user, pipe->ring_size, 0);
+#ifdef CONFIG_WATCH_QUEUE
+ if (pipe->watch_queue) {
+ watch_queue_clear(pipe->watch_queue);
+ put_watch_queue(pipe->watch_queue);
+ }
+#endif
+
+ (void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0);
free_uid(pipe->user);
for (i = 0; i < pipe->ring_size; i++) {
struct pipe_buffer *buf = pipe->bufs + i;
@@ -852,6 +917,17 @@
if (!inode)
return -ENFILE;
+ if (flags & O_NOTIFICATION_PIPE) {
+#ifdef CONFIG_WATCH_QUEUE
+ if (watch_queue_init(inode->i_pipe) < 0) {
+ iput(inode);
+ return -ENOMEM;
+ }
+#else
+ return -ENOPKG;
+#endif
+ }
+
f = alloc_file_pseudo(inode, pipe_mnt, "",
O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
&pipefifo_fops);
@@ -882,7 +958,7 @@
int error;
int fdw, fdr;
- if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT))
+ if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE))
return -EINVAL;
error = create_pipe_files(files, flags);
@@ -1130,42 +1206,12 @@
}
/*
- * Allocate a new array of pipe buffers and copy the info over. Returns the
- * pipe size if successful, or return -ERROR on error.
+ * Resize the pipe ring to a number of slots.
*/
-static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
+int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
{
struct pipe_buffer *bufs;
- unsigned int size, nr_slots, head, tail, mask, n;
- unsigned long user_bufs;
- long ret = 0;
-
- size = round_pipe_size(arg);
- nr_slots = size >> PAGE_SHIFT;
-
- if (!nr_slots)
- return -EINVAL;
-
- /*
- * If trying to increase the pipe capacity, check that an
- * unprivileged user is not trying to exceed various limits
- * (soft limit check here, hard limit check just below).
- * Decreasing the pipe capacity is always permitted, even
- * if the user is currently over a limit.
- */
- if (nr_slots > pipe->ring_size &&
- size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
- return -EPERM;
-
- user_bufs = account_pipe_buffers(pipe->user, pipe->ring_size, nr_slots);
-
- if (nr_slots > pipe->ring_size &&
- (too_many_pipe_buffers_hard(user_bufs) ||
- too_many_pipe_buffers_soft(user_bufs)) &&
- is_unprivileged_user()) {
- ret = -EPERM;
- goto out_revert_acct;
- }
+ unsigned int head, tail, mask, n;
/*
* We can shrink the pipe, if arg is greater than the ring occupancy.
@@ -1177,17 +1223,13 @@
head = pipe->head;
tail = pipe->tail;
n = pipe_occupancy(pipe->head, pipe->tail);
- if (nr_slots < n) {
- ret = -EBUSY;
- goto out_revert_acct;
- }
+ if (nr_slots < n)
+ return -EBUSY;
bufs = kcalloc(nr_slots, sizeof(*bufs),
GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
- if (unlikely(!bufs)) {
- ret = -ENOMEM;
- goto out_revert_acct;
- }
+ if (unlikely(!bufs))
+ return -ENOMEM;
/*
* The pipe array wraps around, so just start the new one at zero
@@ -1215,16 +1257,68 @@
kfree(pipe->bufs);
pipe->bufs = bufs;
pipe->ring_size = nr_slots;
- pipe->max_usage = nr_slots;
+ if (pipe->max_usage > nr_slots)
+ pipe->max_usage = nr_slots;
pipe->tail = tail;
pipe->head = head;
/* This might have made more room for writers */
wake_up_interruptible(&pipe->wr_wait);
+ return 0;
+}
+
+/*
+ * Allocate a new array of pipe buffers and copy the info over. Returns the
+ * pipe size if successful, or return -ERROR on error.
+ */
+static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
+{
+ unsigned long user_bufs;
+ unsigned int nr_slots, size;
+ long ret = 0;
+
+#ifdef CONFIG_WATCH_QUEUE
+ if (pipe->watch_queue)
+ return -EBUSY;
+#endif
+
+ size = round_pipe_size(arg);
+ nr_slots = size >> PAGE_SHIFT;
+
+ if (!nr_slots)
+ return -EINVAL;
+
+ /*
+ * If trying to increase the pipe capacity, check that an
+ * unprivileged user is not trying to exceed various limits
+ * (soft limit check here, hard limit check just below).
+ * Decreasing the pipe capacity is always permitted, even
+ * if the user is currently over a limit.
+ */
+ if (nr_slots > pipe->max_usage &&
+ size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_slots);
+
+ if (nr_slots > pipe->max_usage &&
+ (too_many_pipe_buffers_hard(user_bufs) ||
+ too_many_pipe_buffers_soft(user_bufs)) &&
+ pipe_is_unprivileged_user()) {
+ ret = -EPERM;
+ goto out_revert_acct;
+ }
+
+ ret = pipe_resize_ring(pipe, nr_slots);
+ if (ret < 0)
+ goto out_revert_acct;
+
+ pipe->max_usage = nr_slots;
+ pipe->nr_accounted = nr_slots;
return pipe->max_usage * PAGE_SIZE;
out_revert_acct:
- (void) account_pipe_buffers(pipe->user, nr_slots, pipe->ring_size);
+ (void) account_pipe_buffers(pipe->user, nr_slots, pipe->nr_accounted);
return ret;
}
@@ -1233,9 +1327,17 @@
* location, so checking ->i_pipe is not enough to verify that this is a
* pipe.
*/
-struct pipe_inode_info *get_pipe_info(struct file *file)
+struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
{
- return file->f_op == &pipefifo_fops ? file->private_data : NULL;
+ struct pipe_inode_info *pipe = file->private_data;
+
+ if (file->f_op != &pipefifo_fops || !pipe)
+ return NULL;
+#ifdef CONFIG_WATCH_QUEUE
+ if (for_splice && pipe->watch_queue)
+ return NULL;
+#endif
+ return pipe;
}
long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -1243,7 +1345,7 @@
struct pipe_inode_info *pipe;
long ret;
- pipe = get_pipe_info(file);
+ pipe = get_pipe_info(file, false);
if (!pipe)
return -EBADF;
diff --git a/fs/splice.c b/fs/splice.c
index 6b3c9a0..d7c8a7c 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1101,8 +1101,8 @@
!(out->f_mode & FMODE_WRITE)))
return -EBADF;
- ipipe = get_pipe_info(in);
- opipe = get_pipe_info(out);
+ ipipe = get_pipe_info(in, true);
+ opipe = get_pipe_info(out, true);
if (ipipe && opipe) {
if (off_in || off_out)
@@ -1252,7 +1252,7 @@
static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
unsigned int flags)
{
- struct pipe_inode_info *pipe = get_pipe_info(file);
+ struct pipe_inode_info *pipe = get_pipe_info(file, true);
struct splice_desc sd = {
.total_len = iov_iter_count(iter),
.flags = flags,
@@ -1287,7 +1287,7 @@
if (flags & SPLICE_F_GIFT)
buf_flag = PIPE_BUF_FLAG_GIFT;
- pipe = get_pipe_info(file);
+ pipe = get_pipe_info(file, true);
if (!pipe)
return -EBADF;
@@ -1733,8 +1733,8 @@
*/
long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags)
{
- struct pipe_inode_info *ipipe = get_pipe_info(in);
- struct pipe_inode_info *opipe = get_pipe_info(out);
+ struct pipe_inode_info *ipipe = get_pipe_info(in, true);
+ struct pipe_inode_info *opipe = get_pipe_info(out, true);
int ret = -EINVAL;
if (unlikely(!(in->f_mode & FMODE_READ) ||
diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index 384b5c8..c94e33a 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -83,14 +83,19 @@
void warn_slowpath_fmt(const char *file, const int line, unsigned taint,
const char *fmt, ...);
#define __WARN() __WARN_printf(TAINT_WARN, NULL)
-#define __WARN_printf(taint, arg...) \
- warn_slowpath_fmt(__FILE__, __LINE__, taint, arg)
+#define __WARN_printf(taint, arg...) do { \
+ instrumentation_begin(); \
+ warn_slowpath_fmt(__FILE__, __LINE__, taint, arg); \
+ instrumentation_end(); \
+ } while (0)
#else
extern __printf(1, 2) void __warn_printk(const char *fmt, ...);
#define __WARN() __WARN_FLAGS(BUGFLAG_TAINT(TAINT_WARN))
#define __WARN_printf(taint, arg...) do { \
+ instrumentation_begin(); \
__warn_printk(arg); \
__WARN_FLAGS(BUGFLAG_NO_CUT_HERE | BUGFLAG_TAINT(taint));\
+ instrumentation_end(); \
} while (0)
#define WARN_ON_ONCE(condition) ({ \
int __ret_warn_on = !!(condition); \
diff --git a/include/linux/bsearch.h b/include/linux/bsearch.h
index 8ed53d7..e66b711 100644
--- a/include/linux/bsearch.h
+++ b/include/linux/bsearch.h
@@ -4,7 +4,29 @@
#include <linux/types.h>
-void *bsearch(const void *key, const void *base, size_t num, size_t size,
- cmp_func_t cmp);
+static __always_inline
+void *__inline_bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp)
+{
+ const char *pivot;
+ int result;
+
+ while (num > 0) {
+ pivot = base + (num >> 1) * size;
+ result = cmp(key, pivot);
+
+ if (result == 0)
+ return (void *)pivot;
+
+ if (result > 0) {
+ base = pivot + size;
+ num--;
+ }
+ num >>= 1;
+ }
+
+ return NULL;
+}
+
+extern void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp);
#endif /* _LINUX_BSEARCH_H */
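
[Editor's note: for reference, a minimal hypothetical caller — the cmp_func_t contract is the same whether the out-of-line bsearch() or the new __inline_bsearch() ends up being used:

static int cmp_u32(const void *key, const void *elt)
{
	u32 k = *(const u32 *)key, e = *(const u32 *)elt;

	if (k < e)
		return -1;
	return k > e;
}

static bool table_contains(const u32 *table, size_t n, u32 value)
{
	/* The table must already be sorted in ascending order. */
	return bsearch(&value, table, n, sizeof(*table), cmp_u32) != NULL;
}
]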
diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index 8cac62e..981b880 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -33,13 +33,13 @@
}
/* Called with interrupts disabled. */
-static inline void user_enter_irqoff(void)
+static __always_inline void user_enter_irqoff(void)
{
if (context_tracking_enabled())
__context_tracking_enter(CONTEXT_USER);
}
-static inline void user_exit_irqoff(void)
+static __always_inline void user_exit_irqoff(void)
{
if (context_tracking_enabled())
__context_tracking_exit(CONTEXT_USER);
@@ -75,7 +75,7 @@
* is enabled. If context tracking is disabled, returns
* CONTEXT_DISABLED. This should be used primarily for debugging.
*/
-static inline enum ctx_state ct_state(void)
+static __always_inline enum ctx_state ct_state(void)
{
return context_tracking_enabled() ?
this_cpu_read(context_tracking.state) : CONTEXT_DISABLED;
diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index e7fe667..65a60d3 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -26,12 +26,12 @@
extern struct static_key_false context_tracking_key;
DECLARE_PER_CPU(struct context_tracking, context_tracking);
-static inline bool context_tracking_enabled(void)
+static __always_inline bool context_tracking_enabled(void)
{
return static_branch_unlikely(&context_tracking_key);
}
-static inline bool context_tracking_enabled_cpu(int cpu)
+static __always_inline bool context_tracking_enabled_cpu(int cpu)
{
return context_tracking_enabled() && per_cpu(context_tracking.active, cpu);
}
@@ -41,7 +41,7 @@
return context_tracking_enabled() && __this_cpu_read(context_tracking.active);
}
-static inline bool context_tracking_in_user(void)
+static __always_inline bool context_tracking_in_user(void)
{
return __this_cpu_read(context_tracking.state) == CONTEXT_USER;
}
diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h
index 257ab3c..e7e45f0 100644
--- a/include/linux/debug_locks.h
+++ b/include/linux/debug_locks.h
@@ -12,7 +12,7 @@
extern int debug_locks_silent __read_mostly;
-static inline int __debug_locks_off(void)
+static __always_inline int __debug_locks_off(void)
{
return xchg(&debug_locks, 0);
}
diff --git a/include/linux/edac.h b/include/linux/edac.h
index 0f20b98..6eb7d55 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -31,14 +31,6 @@
extern int edac_op_state;
struct bus_type *edac_get_sysfs_subsys(void);
-int edac_get_report_status(void);
-void edac_set_report_status(int new);
-
-enum {
- EDAC_REPORTING_ENABLED,
- EDAC_REPORTING_DISABLED,
- EDAC_REPORTING_FORCE
-};
static inline void opstate_init(void)
{
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index e07cf85..03c9fec 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -38,9 +38,24 @@
} while (0)
/*
+ * Like __irq_enter(), but without time accounting, for fast
+ * interrupts such as the reschedule IPI, where time accounting
+ * is more expensive than the actual interrupt handling.
+ */
+#define __irq_enter_raw() \
+ do { \
+ preempt_count_add(HARDIRQ_OFFSET); \
+ lockdep_hardirq_enter(); \
+ } while (0)
+
+/*
* Enter irq context (on NO_HZ, update jiffies):
*/
-extern void irq_enter(void);
+void irq_enter(void);
+/*
+ * Like irq_enter(), but RCU is already watching.
+ */
+void irq_enter_rcu(void);
/*
* Exit irq context without processing softirqs:
@@ -53,9 +68,23 @@
} while (0)
/*
+ * Like __irq_exit() without time accounting
+ */
+#define __irq_exit_raw() \
+ do { \
+ lockdep_hardirq_exit(); \
+ preempt_count_sub(HARDIRQ_OFFSET); \
+ } while (0)
+
+/*
* Exit irq context and process softirqs if needed:
*/
-extern void irq_exit(void);
+void irq_exit(void);
+
+/*
+ * Like irq_exit(), but return with RCU watching.
+ */
+void irq_exit_rcu(void);
#ifndef arch_nmi_enter
#define arch_nmi_enter() do { } while (0)
@@ -87,20 +116,24 @@
arch_nmi_enter(); \
printk_nmi_enter(); \
lockdep_off(); \
- ftrace_nmi_enter(); \
BUG_ON(in_nmi() == NMI_MASK); \
__preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \
rcu_nmi_enter(); \
lockdep_hardirq_enter(); \
+ instrumentation_begin(); \
+ ftrace_nmi_enter(); \
+ instrumentation_end(); \
} while (0)
#define nmi_exit() \
do { \
+ instrumentation_begin(); \
+ ftrace_nmi_exit(); \
+ instrumentation_end(); \
lockdep_hardirq_exit(); \
rcu_nmi_exit(); \
BUG_ON(!in_nmi()); \
__preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \
- ftrace_nmi_exit(); \
lockdep_on(); \
printk_nmi_exit(); \
arch_nmi_exit(); \
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 80f637c..5db970b 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -760,8 +760,10 @@
/*
* We want to know which function is an entrypoint of a hardirq or a softirq.
*/
-#define __irq_entry __attribute__((__section__(".irqentry.text")))
-#define __softirq_entry \
- __attribute__((__section__(".softirqentry.text")))
+#ifndef __irq_entry
+# define __irq_entry __attribute__((__section__(".irqentry.text")))
+#endif
+
+#define __softirq_entry __attribute__((__section__(".softirqentry.text")))
#endif
diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index d7f7e43..6384d28 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -32,7 +32,7 @@
#ifdef CONFIG_TRACE_IRQFLAGS
extern void trace_hardirqs_on_prepare(void);
- extern void trace_hardirqs_off_prepare(void);
+ extern void trace_hardirqs_off_finish(void);
extern void trace_hardirqs_on(void);
extern void trace_hardirqs_off(void);
# define lockdep_hardirq_context(p) ((p)->hardirq_context)
@@ -101,7 +101,7 @@
#else
# define trace_hardirqs_on_prepare() do { } while (0)
-# define trace_hardirqs_off_prepare() do { } while (0)
+# define trace_hardirqs_off_finish() do { } while (0)
# define trace_hardirqs_on() do { } while (0)
# define trace_hardirqs_off() do { } while (0)
# define lockdep_hardirq_context(p) 0
diff --git a/include/linux/key.h b/include/linux/key.h
index 6cf8e71..0f2e24f 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -71,6 +71,23 @@
#define KEY_PERM_UNDEF 0xffffffff
+/*
+ * The permissions required on a key that we're looking up.
+ */
+enum key_need_perm {
+ KEY_NEED_UNSPECIFIED, /* Needed permission unspecified */
+ KEY_NEED_VIEW, /* Require permission to view attributes */
+ KEY_NEED_READ, /* Require permission to read content */
+ KEY_NEED_WRITE, /* Require permission to update / modify */
+ KEY_NEED_SEARCH, /* Require permission to search (keyring) or find (key) */
+ KEY_NEED_LINK, /* Require permission to link */
+ KEY_NEED_SETATTR, /* Require permission to change attributes */
+ KEY_NEED_UNLINK, /* Require permission to unlink key */
+ KEY_SYSADMIN_OVERRIDE, /* Special: override by CAP_SYS_ADMIN */
+ KEY_AUTHTOKEN_OVERRIDE, /* Special: override by possession of auth token */
+ KEY_DEFER_PERM_CHECK, /* Special: permission check is deferred */
+};
+
struct seq_file;
struct user_struct;
struct signal_struct;
@@ -176,6 +193,9 @@
struct list_head graveyard_link;
struct rb_node serial_node;
};
+#ifdef CONFIG_KEY_NOTIFICATIONS
+ struct watch_list *watchers; /* Entities watching this key for changes */
+#endif
struct rw_semaphore sem; /* change vs change sem */
struct key_user *user; /* owner of this key */
void *security; /* security data for this key */
@@ -417,20 +437,9 @@
extern void key_set_timeout(struct key *, unsigned);
extern key_ref_t lookup_user_key(key_serial_t id, unsigned long flags,
- key_perm_t perm);
+ enum key_need_perm need_perm);
extern void key_free_user_ns(struct user_namespace *);
-/*
- * The permissions required on a key that we're looking up.
- */
-#define KEY_NEED_VIEW 0x01 /* Require permission to view attributes */
-#define KEY_NEED_READ 0x02 /* Require permission to read content */
-#define KEY_NEED_WRITE 0x04 /* Require permission to update / modify */
-#define KEY_NEED_SEARCH 0x08 /* Require permission to search (keyring) or find (key) */
-#define KEY_NEED_LINK 0x10 /* Require permission to link */
-#define KEY_NEED_SETATTR 0x20 /* Require permission to change attributes */
-#define KEY_NEED_ALL 0x3f /* All the above permissions */
-
static inline short key_read_state(const struct key *key)
{
/* Barrier versus mark_key_instantiated(). */
diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h
index 99d629f..28f23b3 100644
--- a/include/linux/lsm_audit.h
+++ b/include/linux/lsm_audit.h
@@ -75,6 +75,7 @@
#define LSM_AUDIT_DATA_IBPKEY 13
#define LSM_AUDIT_DATA_IBENDPORT 14
#define LSM_AUDIT_DATA_LOCKDOWN 15
+#define LSM_AUDIT_DATA_NOTIFICATION 16
union {
struct path path;
struct dentry *dentry;
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index fb3ce6c..4a3d70ba 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -254,6 +254,15 @@
LSM_HOOK(int, 0, inode_getsecctx, struct inode *inode, void **ctx,
u32 *ctxlen)
+#if defined(CONFIG_SECURITY) && defined(CONFIG_WATCH_QUEUE)
+LSM_HOOK(int, 0, post_notification, const struct cred *w_cred,
+ const struct cred *cred, struct watch_notification *n)
+#endif /* CONFIG_SECURITY && CONFIG_WATCH_QUEUE */
+
+#if defined(CONFIG_SECURITY) && defined(CONFIG_KEY_NOTIFICATIONS)
+LSM_HOOK(int, 0, watch_key, struct key *key)
+#endif /* CONFIG_SECURITY && CONFIG_KEY_NOTIFICATIONS */
+
#ifdef CONFIG_SECURITY_NETWORK
LSM_HOOK(int, 0, unix_stream_connect, struct sock *sock, struct sock *other,
struct sock *newsk)
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 3e62dab..73d8795 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1445,6 +1445,20 @@
* @ctx is a pointer in which to place the allocated security context.
* @ctxlen points to the place to put the length of @ctx.
*
+ * Security hooks for the general notification queue:
+ *
+ * @post_notification:
+ * Check to see if a watch notification can be posted to a particular
+ * queue.
+ * @w_cred: The credentials of whoever set the watch.
+ * @cred: The credentials of the event triggerer.
+ * @n: The notification being posted.
+ *
+ * @watch_key:
+ * Check to see if a process is allowed to watch for event notifications
+ * from a key or keyring.
+ * @key: The key to watch.
+ *
* Security hooks for using the eBPF maps and programs functionalities through
* eBPF syscalls.
*
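
[Editor's note: how an LSM would wire up the two new hooks is not shown in this patch. A hypothetical sketch — the "example" LSM name and both callbacks are invented; LSM_HOOK_INIT() and security_hook_list are the existing registration machinery:

static int example_watch_key(struct key *key)
{
	/* Return 0 to allow the watch, or -EPERM to refuse it. */
	return 0;
}

static int example_post_notification(const struct cred *w_cred,
				     const struct cred *cred,
				     struct watch_notification *n)
{
	/* Compare the watcher's creds against the event triggerer's. */
	return 0;
}

static struct security_hook_list example_hooks[] __lsm_ro_after_init = {
	LSM_HOOK_INIT(watch_key, example_watch_key),
	LSM_HOOK_INIT(post_notification, example_post_notification),
};
]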
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 9a57e67..0ad5769 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -550,6 +550,7 @@
#define PCI_DEVICE_ID_AMD_17H_DF_F3 0x1463
#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F3 0x15eb
#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F3 0x1493
+#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F3 0x144b
#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F3 0x1443
#define PCI_DEVICE_ID_AMD_19H_DF_F3 0x1653
#define PCI_DEVICE_ID_AMD_CNB17H_F3 0x1703
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 0c31b94..50afd0d 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -9,6 +9,10 @@
#define PIPE_BUF_FLAG_GIFT 0x04 /* page is a gift */
#define PIPE_BUF_FLAG_PACKET 0x08 /* read() as a packet */
#define PIPE_BUF_FLAG_CAN_MERGE 0x10 /* can merge buffers */
+#define PIPE_BUF_FLAG_WHOLE 0x20 /* read() must return entire buffer or error */
+#ifdef CONFIG_WATCH_QUEUE
+#define PIPE_BUF_FLAG_LOSS 0x40 /* Message loss happened after this buffer */
+#endif
/**
* struct pipe_buffer - a linux kernel pipe buffer
@@ -34,8 +38,10 @@
* @wr_wait: writer wait point in case of full pipe
* @head: The point of buffer production
* @tail: The point of buffer consumption
+ * @note_loss: The next read() should insert a data-lost message
* @max_usage: The maximum number of slots that may be used in the ring
* @ring_size: total number of buffers (should be a power of 2)
+ * @nr_accounted: The amount this pipe accounts for in user->pipe_bufs
* @tmp_page: cached released page
* @readers: number of current readers of this pipe
* @writers: number of current writers of this pipe
@@ -46,6 +52,7 @@
* @fasync_writers: writer side fasync
* @bufs: the circular array of pipe buffers
* @user: the user who created this pipe
+ * @watch_queue: If this pipe is a watch_queue, the state for the notification queue
**/
struct pipe_inode_info {
struct mutex mutex;
@@ -54,6 +61,10 @@
unsigned int tail;
unsigned int max_usage;
unsigned int ring_size;
+#ifdef CONFIG_WATCH_QUEUE
+ bool note_loss;
+#endif
+ unsigned int nr_accounted;
unsigned int readers;
unsigned int writers;
unsigned int files;
@@ -64,6 +75,9 @@
struct fasync_struct *fasync_writers;
struct pipe_buffer *bufs;
struct user_struct *user;
+#ifdef CONFIG_WATCH_QUEUE
+ struct watch_queue *watch_queue;
+#endif
};
/*
@@ -239,9 +253,20 @@
extern const struct pipe_buf_operations nosteal_pipe_buf_ops;
+#ifdef CONFIG_WATCH_QUEUE
+unsigned long account_pipe_buffers(struct user_struct *user,
+ unsigned long old, unsigned long new);
+bool too_many_pipe_buffers_soft(unsigned long user_bufs);
+bool too_many_pipe_buffers_hard(unsigned long user_bufs);
+bool pipe_is_unprivileged_user(void);
+#endif
+
/* for F_SETPIPE_SZ and F_GETPIPE_SZ */
+#ifdef CONFIG_WATCH_QUEUE
+int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots);
+#endif
long pipe_fcntl(struct file *, unsigned int, unsigned long arg);
-struct pipe_inode_info *get_pipe_info(struct file *file);
+struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice);
int create_pipe_files(struct file **, int);
unsigned int round_pipe_size(unsigned long size);
diff --git a/include/linux/ras.h b/include/linux/ras.h
index 7c3debb..1f4048b 100644
--- a/include/linux/ras.h
+++ b/include/linux/ras.h
@@ -17,12 +17,7 @@
#endif
#ifdef CONFIG_RAS_CEC
-void __init cec_init(void);
int __init parse_cec_param(char *str);
-int cec_add_elem(u64 pfn);
-#else
-static inline void __init cec_init(void) { }
-static inline int cec_add_elem(u64 pfn) { return -ENODEV; }
#endif
#ifdef CONFIG_RAS
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4ea612e..b62e6aa 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1308,7 +1308,9 @@
#ifdef CONFIG_X86_MCE
u64 mce_addr;
- u64 mce_status;
+ __u64 mce_ripv : 1,
+ mce_whole_page : 1,
+ __mce_reserved : 62;
struct callback_head mce_kill_me;
#endif
diff --git a/include/linux/security.h b/include/linux/security.h
index b3f2cb2..469fa91 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -56,6 +56,8 @@
struct fs_context;
struct fs_parameter;
enum fs_value_type;
+struct watch;
+struct watch_notification;
/* Default (no) options for the capable function */
#define CAP_OPT_NONE 0x0
@@ -1282,6 +1284,28 @@
}
#endif /* CONFIG_SECURITY */
+#if defined(CONFIG_SECURITY) && defined(CONFIG_WATCH_QUEUE)
+int security_post_notification(const struct cred *w_cred,
+ const struct cred *cred,
+ struct watch_notification *n);
+#else
+static inline int security_post_notification(const struct cred *w_cred,
+ const struct cred *cred,
+ struct watch_notification *n)
+{
+ return 0;
+}
+#endif
+
+#if defined(CONFIG_SECURITY) && defined(CONFIG_KEY_NOTIFICATIONS)
+int security_watch_key(struct key *key);
+#else
+static inline int security_watch_key(struct key *key)
+{
+ return 0;
+}
+#endif
+
#ifdef CONFIG_SECURITY_NETWORK
int security_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk);
@@ -1750,8 +1774,8 @@
int security_key_alloc(struct key *key, const struct cred *cred, unsigned long flags);
void security_key_free(struct key *key);
-int security_key_permission(key_ref_t key_ref,
- const struct cred *cred, unsigned perm);
+int security_key_permission(key_ref_t key_ref, const struct cred *cred,
+ enum key_need_perm need_perm);
int security_key_getsecurity(struct key *key, char **_buffer);
#else
@@ -1769,7 +1793,7 @@
static inline int security_key_permission(key_ref_t key_ref,
const struct cred *cred,
- unsigned perm)
+ enum key_need_perm need_perm)
{
return 0;
}
diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h
index 86281ac..860e0f84 100644
--- a/include/linux/set_memory.h
+++ b/include/linux/set_memory.h
@@ -26,7 +26,7 @@
#endif
#ifndef set_mce_nospec
-static inline int set_mce_nospec(unsigned long pfn)
+static inline int set_mce_nospec(unsigned long pfn, bool unmap)
{
return 0;
}
diff --git a/include/linux/watch_queue.h b/include/linux/watch_queue.h
new file mode 100644
index 0000000..5e08db2
--- /dev/null
+++ b/include/linux/watch_queue.h
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0
+/* User-mappable watch queue
+ *
+ * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * See Documentation/watch_queue.rst
+ */
+
+#ifndef _LINUX_WATCH_QUEUE_H
+#define _LINUX_WATCH_QUEUE_H
+
+#include <uapi/linux/watch_queue.h>
+#include <linux/kref.h>
+#include <linux/rcupdate.h>
+
+#ifdef CONFIG_WATCH_QUEUE
+
+struct cred;
+
+struct watch_type_filter {
+ enum watch_notification_type type;
+ __u32 subtype_filter[1]; /* Bitmask of subtypes to filter on */
+ __u32 info_filter; /* Filter on watch_notification::info */
+ __u32 info_mask; /* Mask of relevant bits in info_filter */
+};
+
+struct watch_filter {
+ union {
+ struct rcu_head rcu;
+ unsigned long type_filter[2]; /* Bitmask of accepted types */
+ };
+ u32 nr_filters; /* Number of filters */
+ struct watch_type_filter filters[];
+};
+
+struct watch_queue {
+ struct rcu_head rcu;
+ struct watch_filter __rcu *filter;
+ struct pipe_inode_info *pipe; /* The pipe we're using as a buffer */
+ struct hlist_head watches; /* Contributory watches */
+ struct page **notes; /* Preallocated notifications */
+ unsigned long *notes_bitmap; /* Allocation bitmap for notes */
+ struct kref usage; /* Object usage count */
+ spinlock_t lock;
+ unsigned int nr_notes; /* Number of notes */
+ unsigned int nr_pages; /* Number of pages in notes[] */
+ bool defunct; /* T when queues closed */
+};
+
+/*
+ * Representation of a watch on an object.
+ */
+struct watch {
+ union {
+ struct rcu_head rcu;
+ u32 info_id; /* ID to be OR'd in to info field */
+ };
+ struct watch_queue __rcu *queue; /* Queue to post events to */
+ struct hlist_node queue_node; /* Link in queue->watches */
+ struct watch_list __rcu *watch_list;
+ struct hlist_node list_node; /* Link in watch_list->watchers */
+ const struct cred *cred; /* Creds of the owner of the watch */
+ void *private; /* Private data for the watched object */
+ u64 id; /* Internal identifier */
+ struct kref usage; /* Object usage count */
+};
+
+/*
+ * List of watches on an object.
+ */
+struct watch_list {
+ struct rcu_head rcu;
+ struct hlist_head watchers;
+ void (*release_watch)(struct watch *);
+ spinlock_t lock;
+};
+
+extern void __post_watch_notification(struct watch_list *,
+ struct watch_notification *,
+ const struct cred *,
+ u64);
+extern struct watch_queue *get_watch_queue(int);
+extern void put_watch_queue(struct watch_queue *);
+extern void init_watch(struct watch *, struct watch_queue *);
+extern int add_watch_to_object(struct watch *, struct watch_list *);
+extern int remove_watch_from_object(struct watch_list *, struct watch_queue *, u64, bool);
+extern long watch_queue_set_size(struct pipe_inode_info *, unsigned int);
+extern long watch_queue_set_filter(struct pipe_inode_info *,
+ struct watch_notification_filter __user *);
+extern int watch_queue_init(struct pipe_inode_info *);
+extern void watch_queue_clear(struct watch_queue *);
+
+static inline void init_watch_list(struct watch_list *wlist,
+ void (*release_watch)(struct watch *))
+{
+ INIT_HLIST_HEAD(&wlist->watchers);
+ spin_lock_init(&wlist->lock);
+ wlist->release_watch = release_watch;
+}
+
+static inline void post_watch_notification(struct watch_list *wlist,
+ struct watch_notification *n,
+ const struct cred *cred,
+ u64 id)
+{
+ if (unlikely(wlist))
+ __post_watch_notification(wlist, n, cred, id);
+}
+
+static inline void remove_watch_list(struct watch_list *wlist, u64 id)
+{
+ if (wlist) {
+ remove_watch_from_object(wlist, NULL, id, true);
+ kfree_rcu(wlist, rcu);
+ }
+}
+
+/**
+ * watch_sizeof - Calculate the information part of the size of a watch record,
+ * given the structure size.
+ */
+#define watch_sizeof(STRUCT) (sizeof(STRUCT) << WATCH_INFO_LENGTH__SHIFT)
+
+#endif
+
+#endif /* _LINUX_WATCH_QUEUE_H */
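
[Editor's note: a sketch of how a watched object inside the kernel might drive this API. The object type, release callback and the reuse of the key event constants are hypothetical; only the watch_list/notification calls come from this header:

struct my_object {
	struct watch_list *watchers;
	u64 id;
};

static void my_release_watch(struct watch *watch)
{
	/* Drop whatever reference watch->private holds on the object. */
}

static int my_object_enable_watching(struct my_object *obj)
{
	obj->watchers = kzalloc(sizeof(*obj->watchers), GFP_KERNEL);
	if (!obj->watchers)
		return -ENOMEM;
	init_watch_list(obj->watchers, my_release_watch);
	return 0;
}

static void my_object_changed(struct my_object *obj)
{
	struct watch_notification n = {
		.type	 = WATCH_TYPE_KEY_NOTIFY,	/* stand-in event type */
		.subtype = NOTIFY_KEY_UPDATED,		/* stand-in subtype */
		.info	 = watch_sizeof(n),
	};

	post_watch_notification(obj->watchers, &n, current_cred(), obj->id);
}
]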
diff --git a/include/uapi/linux/keyctl.h b/include/uapi/linux/keyctl.h
index ed3d589..4c8884e 100644
--- a/include/uapi/linux/keyctl.h
+++ b/include/uapi/linux/keyctl.h
@@ -69,6 +69,7 @@
#define KEYCTL_RESTRICT_KEYRING 29 /* Restrict keys allowed to link to a keyring */
#define KEYCTL_MOVE 30 /* Move keys between keyrings */
#define KEYCTL_CAPABILITIES 31 /* Find capabilities of keyrings subsystem */
+#define KEYCTL_WATCH_KEY 32 /* Watch a key or ring of keys for changes */
/* keyctl structures */
struct keyctl_dh_params {
@@ -130,5 +131,6 @@
#define KEYCTL_CAPS0_MOVE 0x80 /* KEYCTL_MOVE supported */
#define KEYCTL_CAPS1_NS_KEYRING_NAME 0x01 /* Keyring names are per-user_namespace */
#define KEYCTL_CAPS1_NS_KEY_TAG 0x02 /* Key indexing can include a namespace tag */
+#define KEYCTL_CAPS1_NOTIFICATIONS 0x04 /* Keys generate watchable notifications */
#endif /* _LINUX_KEYCTL_H */
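
[Editor's note: a userspace sketch of the new operation, following the keyctl() signature documented in the core.rst hunk of this series. The raw syscall is used to avoid assuming libkeyutils support, and the single-entry filter asking for all key events is illustrative:

#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/keyctl.h>
#include <linux/watch_queue.h>

static long watch_key_all_events(int32_t key, int watch_fd)
{
	struct {
		struct watch_notification_filter hdr;
		struct watch_notification_type_filter tf[1];
	} f = {
		.hdr.nr_filters = 1,
		.tf[0] = {
			.type = WATCH_TYPE_KEY_NOTIFY,
			.subtype_filter = { ~0U },	/* all key subtypes */
		},
	};

	/* Passing NULL instead of &f would remove the watch again. */
	return syscall(__NR_keyctl, KEYCTL_WATCH_KEY, key, watch_fd, &f);
}
]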
diff --git a/include/uapi/linux/watch_queue.h b/include/uapi/linux/watch_queue.h
new file mode 100644
index 0000000..c3d8320
--- /dev/null
+++ b/include/uapi/linux/watch_queue.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_WATCH_QUEUE_H
+#define _UAPI_LINUX_WATCH_QUEUE_H
+
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/ioctl.h>
+
+#define O_NOTIFICATION_PIPE O_EXCL /* Parameter to pipe2() selecting notification pipe */
+
+#define IOC_WATCH_QUEUE_SET_SIZE _IO('W', 0x60) /* Set the size in notification slots */
+#define IOC_WATCH_QUEUE_SET_FILTER _IO('W', 0x61) /* Set the filter */
+
+enum watch_notification_type {
+ WATCH_TYPE_META = 0, /* Special record */
+ WATCH_TYPE_KEY_NOTIFY = 1, /* Key change event notification */
+ WATCH_TYPE__NR = 2
+};
+
+enum watch_meta_notification_subtype {
+ WATCH_META_REMOVAL_NOTIFICATION = 0, /* Watched object was removed */
+ WATCH_META_LOSS_NOTIFICATION = 1, /* Data loss occurred */
+};
+
+/*
+ * Notification record header. This is aligned to 64-bits so that subclasses
+ * can contain __u64 fields.
+ */
+struct watch_notification {
+ __u32 type:24; /* enum watch_notification_type */
+ __u32 subtype:8; /* Type-specific subtype (filterable) */
+ __u32 info;
+#define WATCH_INFO_LENGTH 0x0000007f /* Length of record */
+#define WATCH_INFO_LENGTH__SHIFT 0
+#define WATCH_INFO_ID 0x0000ff00 /* ID of watchpoint */
+#define WATCH_INFO_ID__SHIFT 8
+#define WATCH_INFO_TYPE_INFO 0xffff0000 /* Type-specific info */
+#define WATCH_INFO_TYPE_INFO__SHIFT 16
+#define WATCH_INFO_FLAG_0 0x00010000 /* Type-specific info, flag bit 0 */
+#define WATCH_INFO_FLAG_1 0x00020000 /* ... */
+#define WATCH_INFO_FLAG_2 0x00040000
+#define WATCH_INFO_FLAG_3 0x00080000
+#define WATCH_INFO_FLAG_4 0x00100000
+#define WATCH_INFO_FLAG_5 0x00200000
+#define WATCH_INFO_FLAG_6 0x00400000
+#define WATCH_INFO_FLAG_7 0x00800000
+};
+
+/*
+ * Notification filtering rules (IOC_WATCH_QUEUE_SET_FILTER).
+ */
+struct watch_notification_type_filter {
+ __u32 type; /* Type to apply filter to */
+ __u32 info_filter; /* Filter on watch_notification::info */
+ __u32 info_mask; /* Mask of relevant bits in info_filter */
+ __u32 subtype_filter[8]; /* Bitmask of subtypes to filter on */
+};
+
+struct watch_notification_filter {
+ __u32 nr_filters; /* Number of filters */
+ __u32 __reserved; /* Must be 0 */
+ struct watch_notification_type_filter filters[];
+};
+
+
+/*
+ * Extended watch removal notification. This is used optionally if the type
+ * wants to indicate an identifier for the object being watched, if there is
+ * such. This can be distinguished by the length.
+ *
+ * type -> WATCH_TYPE_META
+ * subtype -> WATCH_META_REMOVAL_NOTIFICATION
+ */
+struct watch_notification_removal {
+ struct watch_notification watch;
+ __u64 id; /* Type-dependent identifier */
+};
+
+/*
+ * Type of key/keyring change notification.
+ */
+enum key_notification_subtype {
+ NOTIFY_KEY_INSTANTIATED = 0, /* Key was instantiated (aux is error code) */
+ NOTIFY_KEY_UPDATED = 1, /* Key was updated */
+ NOTIFY_KEY_LINKED = 2, /* Key (aux) was added to watched keyring */
+ NOTIFY_KEY_UNLINKED = 3, /* Key (aux) was removed from watched keyring */
+ NOTIFY_KEY_CLEARED = 4, /* Keyring was cleared */
+ NOTIFY_KEY_REVOKED = 5, /* Key was revoked */
+ NOTIFY_KEY_INVALIDATED = 6, /* Key was invalidated */
+ NOTIFY_KEY_SETATTR = 7, /* Key's attributes got changed */
+};
+
+/*
+ * Key/keyring notification record.
+ * - watch.type = WATCH_TYPE_KEY_NOTIFY
+ * - watch.subtype = enum key_notification_subtype
+ */
+struct key_notification {
+ struct watch_notification watch;
+ __u32 key_id; /* The key/keyring affected */
+ __u32 aux; /* Per-type auxiliary data */
+};
+
+#endif /* _UAPI_LINUX_WATCH_QUEUE_H */
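
[Editor's note: tying the UAPI together, a hedged userspace sketch: open a notification pipe, give it an arbitrary 128 notification slots, then parse whole records out of read() using the info-field length. Error handling is minimal and watch registration is elided:

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/watch_queue.h>

int main(void)
{
	unsigned char buf[4096] __attribute__((__aligned__(8)));
	int fds[2];
	ssize_t n;

	if (pipe2(fds, O_NOTIFICATION_PIPE) < 0 ||
	    ioctl(fds[0], IOC_WATCH_QUEUE_SET_SIZE, 128) < 0)
		return 1;

	/* ... place watches against fds[0], e.g. via KEYCTL_WATCH_KEY ... */

	while ((n = read(fds[0], buf, sizeof(buf))) > 0) {
		ssize_t i = 0;

		while (i < n) {
			struct watch_notification *wn =
				(struct watch_notification *)(buf + i);
			size_t len = (wn->info & WATCH_INFO_LENGTH) >>
				     WATCH_INFO_LENGTH__SHIFT;

			if (len == 0)
				break;	/* malformed record; don't spin */
			printf("type=%u subtype=%u len=%zu\n",
			       wn->type, wn->subtype, len);
			i += len;
		}
	}
	return 0;
}
]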
diff --git a/include/xen/events.h b/include/xen/events.h
index 12b0dcb..df1e6391 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -90,13 +90,6 @@
int irq_from_virq(unsigned int cpu, unsigned int virq);
evtchn_port_t evtchn_from_irq(unsigned irq);
-#ifdef CONFIG_XEN_PVHVM
-/* Xen HVM evtchn vector callback */
-void xen_hvm_callback_vector(void);
-#ifdef CONFIG_TRACING
-#define trace_xen_hvm_callback_vector xen_hvm_callback_vector
-#endif
-#endif
int xen_set_callback_via(uint64_t via);
void xen_evtchn_do_upcall(struct pt_regs *regs);
void xen_hvm_evtchn_do_upcall(void);
diff --git a/include/xen/hvm.h b/include/xen/hvm.h
index 0b15f8c..b7fd7fc 100644
--- a/include/xen/hvm.h
+++ b/include/xen/hvm.h
@@ -58,4 +58,6 @@
#define HVM_CALLBACK_VECTOR(x) (((uint64_t)HVM_CALLBACK_VIA_TYPE_VECTOR)<<\
HVM_CALLBACK_VIA_TYPE_SHIFT | (x))
+void xen_setup_callback_vector(void);
+
#endif /* XEN_HVM_H__ */
diff --git a/include/xen/interface/hvm/hvm_op.h b/include/xen/interface/hvm/hvm_op.h
index 956a046..25d945e 100644
--- a/include/xen/interface/hvm/hvm_op.h
+++ b/include/xen/interface/hvm/hvm_op.h
@@ -21,6 +21,8 @@
#ifndef __XEN_PUBLIC_HVM_HVM_OP_H__
#define __XEN_PUBLIC_HVM_HVM_OP_H__
+#include <xen/interface/xen.h>
+
/* Get/set subcommands: the second argument of the hypercall is a
* pointer to a xen_hvm_param struct. */
#define HVMOP_set_param 0
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index 095be1d..39a5580 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -215,17 +215,7 @@
void xen_efi_runtime_setup(void);
-#ifdef CONFIG_PREEMPTION
-
-static inline void xen_preemptible_hcall_begin(void)
-{
-}
-
-static inline void xen_preemptible_hcall_end(void)
-{
-}
-
-#else
+#if defined(CONFIG_XEN_PV) && !defined(CONFIG_PREEMPTION)
DECLARE_PER_CPU(bool, xen_in_preemptible_hcall);
@@ -239,6 +229,11 @@
__this_cpu_write(xen_in_preemptible_hcall, false);
}
-#endif /* CONFIG_PREEMPTION */
+#else
+
+static inline void xen_preemptible_hcall_begin(void) { }
+static inline void xen_preemptible_hcall_end(void) { }
+
+#endif /* CONFIG_XEN_PV && !CONFIG_PREEMPTION */
#endif /* INCLUDE_XEN_OPS_H */
diff --git a/init/Kconfig b/init/Kconfig
index 49eb7a3..b561ecf 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -367,6 +367,18 @@
depends on SYSCTL
default y
+config WATCH_QUEUE
+ bool "General notification queue"
+ default n
+ help
+
+ This is a general notification queue for the kernel to pass events to
+ userspace by splicing them into pipes. It can be used in conjunction
+ with watches for key/keyring change notifications and device
+ notifications.
+
+ See Documentation/watch_queue.rst
+
config CROSS_MEMORY_ATTACH
bool "Enable process_vm_readv/writev syscalls"
depends on MMU
diff --git a/kernel/Makefile b/kernel/Makefile
index ce8716a..f3218bc 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -121,6 +121,7 @@
obj-$(CONFIG_HAS_IOMEM) += iomem.o
obj-$(CONFIG_RSEQ) += rseq.o
+obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o
obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index ce43088..36a98c48 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -31,7 +31,7 @@
DEFINE_PER_CPU(struct context_tracking, context_tracking);
EXPORT_SYMBOL_GPL(context_tracking);
-static bool context_tracking_recursion_enter(void)
+static noinstr bool context_tracking_recursion_enter(void)
{
int recursion;
@@ -45,7 +45,7 @@
return false;
}
-static void context_tracking_recursion_exit(void)
+static __always_inline void context_tracking_recursion_exit(void)
{
__this_cpu_dec(context_tracking.recursion);
}
@@ -59,7 +59,7 @@
* instructions to execute won't use any RCU read side critical section
* because this function sets RCU in extended quiescent state.
*/
-void __context_tracking_enter(enum ctx_state state)
+void noinstr __context_tracking_enter(enum ctx_state state)
{
/* Kernel threads aren't supposed to go to userspace */
WARN_ON_ONCE(!current->mm);
@@ -77,8 +77,10 @@
* on the tick.
*/
if (state == CONTEXT_USER) {
+ instrumentation_begin();
trace_user_enter(0);
vtime_user_enter(current);
+ instrumentation_end();
}
rcu_user_enter();
}
@@ -99,7 +101,6 @@
}
context_tracking_recursion_exit();
}
-NOKPROBE_SYMBOL(__context_tracking_enter);
EXPORT_SYMBOL_GPL(__context_tracking_enter);
void context_tracking_enter(enum ctx_state state)
@@ -142,7 +143,7 @@
* This call supports re-entrancy. This way it can be called from any exception
* handler without needing to know if we came from userspace or not.
*/
-void __context_tracking_exit(enum ctx_state state)
+void noinstr __context_tracking_exit(enum ctx_state state)
{
if (!context_tracking_recursion_enter())
return;
@@ -155,15 +156,16 @@
*/
rcu_user_exit();
if (state == CONTEXT_USER) {
+ instrumentation_begin();
vtime_user_exit(current);
trace_user_exit(0);
+ instrumentation_end();
}
}
__this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
}
context_tracking_recursion_exit();
}
-NOKPROBE_SYMBOL(__context_tracking_exit);
EXPORT_SYMBOL_GPL(__context_tracking_exit);
void context_tracking_exit(enum ctx_state state)
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 38cce34..29a8de4 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -393,7 +393,7 @@
task->lockdep_recursion = 0;
}
-static inline void lockdep_recursion_finish(void)
+static __always_inline void lockdep_recursion_finish(void)
{
if (WARN_ON_ONCE(--current->lockdep_recursion))
current->lockdep_recursion = 0;
@@ -801,7 +801,7 @@
}
/* used from NMI context -- must be lockless */
-static inline struct lock_class *
+static __always_inline struct lock_class *
look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
{
struct lockdep_subclass_key *key;
diff --git a/kernel/panic.c b/kernel/panic.c
index 85568bb..e2157ca 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -680,10 +680,12 @@
* Called when gcc's -fstack-protector feature is used, and
* gcc detects corruption of the on-stack canary value
*/
-__visible void __stack_chk_fail(void)
+__visible noinstr void __stack_chk_fail(void)
{
+ instrumentation_begin();
panic("stack-protector: Kernel stack is corrupted in: %pB",
__builtin_return_address(0));
+ instrumentation_end();
}
EXPORT_SYMBOL(__stack_chk_fail);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index a47c6dd..c4201b7f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -339,12 +339,11 @@
local_irq_restore(flags);
}
-/*
- * Enter an interrupt context.
+/**
+ * irq_enter_rcu - Enter an interrupt context with RCU watching
*/
-void irq_enter(void)
+void irq_enter_rcu(void)
{
- rcu_irq_enter();
if (is_idle_task(current) && !in_interrupt()) {
/*
* Prevent raise_softirq from needlessly waking up ksoftirqd
@@ -354,10 +353,18 @@
tick_irq_enter();
_local_bh_enable();
}
-
__irq_enter();
}
+/**
+ * irq_enter - Enter an interrupt context including RCU update
+ */
+void irq_enter(void)
+{
+ rcu_irq_enter();
+ irq_enter_rcu();
+}
+
static inline void invoke_softirq(void)
{
if (ksoftirqd_running(local_softirq_pending()))
@@ -397,10 +404,7 @@
#endif
}
-/*
- * Exit an interrupt context. Process softirqs if needed and possible:
- */
-void irq_exit(void)
+static inline void __irq_exit_rcu(void)
{
#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
local_irq_disable();
@@ -413,6 +417,28 @@
invoke_softirq();
tick_irq_exit();
+}
+
+/**
+ * irq_exit_rcu() - Exit an interrupt context without updating RCU
+ *
+ * Also processes softirqs if needed and possible.
+ */
+void irq_exit_rcu(void)
+{
+ __irq_exit_rcu();
+ /* must be last! */
+ lockdep_hardirq_exit();
+}
+
+/**
+ * irq_exit - Exit an interrupt context, update RCU and lockdep
+ *
+ * Also processes softirqs if needed and possible.
+ */
+void irq_exit(void)
+{
+ __irq_exit_rcu();
rcu_irq_exit();
/* must be last! */
lockdep_hardirq_exit();
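
[Editor's note: the split exists for entry code that manages the RCU transition itself: such a path brackets the handler body with irq_enter_rcu()/irq_exit_rcu(), while legacy callers keep the combined irq_enter()/irq_exit(). An illustrative sketch — the handler name is invented; the real users are the x86 idtentry conversions elsewhere in this series:

static void my_interrupt_body(struct pt_regs *regs)
{
	/* The entry code has already done the rcu_irq_enter() equivalent. */
	irq_enter_rcu();

	/* ... run the actual handler ... */

	irq_exit_rcu();	/* softirqs may run here; RCU state is untouched */
}
]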
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 9ebaab1..d20d489 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -953,7 +953,7 @@
* but without the sequence counter protect. This internal function
* is called just when timekeeping lock is already held.
*/
-time64_t __ktime_get_real_seconds(void)
+noinstr time64_t __ktime_get_real_seconds(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
index fb0691b..f10073e 100644
--- a/kernel/trace/trace_preemptirq.c
+++ b/kernel/trace/trace_preemptirq.c
@@ -58,7 +58,7 @@
* and lockdep uses a staged approach which splits the lockdep hardirq
* tracking into a RCU on and a RCU off section.
*/
-void trace_hardirqs_off_prepare(void)
+void trace_hardirqs_off_finish(void)
{
if (!this_cpu_read(tracing_irq_cpu)) {
this_cpu_write(tracing_irq_cpu, 1);
@@ -68,19 +68,19 @@
}
}
-EXPORT_SYMBOL(trace_hardirqs_off_prepare);
-NOKPROBE_SYMBOL(trace_hardirqs_off_prepare);
+EXPORT_SYMBOL(trace_hardirqs_off_finish);
+NOKPROBE_SYMBOL(trace_hardirqs_off_finish);
void trace_hardirqs_off(void)
{
+ lockdep_hardirqs_off(CALLER_ADDR0);
+
if (!this_cpu_read(tracing_irq_cpu)) {
this_cpu_write(tracing_irq_cpu, 1);
tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
if (!in_nmi())
trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
}
-
- lockdep_hardirqs_off(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_off);
NOKPROBE_SYMBOL(trace_hardirqs_off);
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
new file mode 100644
index 0000000..f74020f
--- /dev/null
+++ b/kernel/watch_queue.c
@@ -0,0 +1,655 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Watch queue and general notification mechanism, built on pipes
+ *
+ * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * See Documentation/watch_queue.rst
+ */
+
+#define pr_fmt(fmt) "watchq: " fmt
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/printk.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/poll.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/file.h>
+#include <linux/security.h>
+#include <linux/cred.h>
+#include <linux/sched/signal.h>
+#include <linux/watch_queue.h>
+#include <linux/pipe_fs_i.h>
+
+MODULE_DESCRIPTION("Watch queue");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+#define WATCH_QUEUE_NOTE_SIZE 128
+#define WATCH_QUEUE_NOTES_PER_PAGE (PAGE_SIZE / WATCH_QUEUE_NOTE_SIZE)
+
+static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{
+ struct watch_queue *wqueue = (struct watch_queue *)buf->private;
+ struct page *page;
+ unsigned int bit;
+
+ /* We need to work out which note within the page this refers to, but
+ * the note might have been maximum size, so merely masking off the
+ * offset doesn't work. On the other hand, the note must have been
+ * more than zero size.
+ */
+ bit = buf->offset + buf->len;
+ if ((bit & (WATCH_QUEUE_NOTE_SIZE - 1)) == 0)
+ bit -= WATCH_QUEUE_NOTE_SIZE;
+ bit /= WATCH_QUEUE_NOTE_SIZE;
+
+ page = buf->page;
+ bit += page->index;
+
+ set_bit(bit, wqueue->notes_bitmap);
+}
+
+// No try_steal function => no stealing
+#define watch_queue_pipe_buf_try_steal NULL
+
+/* New data written to a pipe may be appended to a buffer with this type. */
+static const struct pipe_buf_operations watch_queue_pipe_buf_ops = {
+ .release = watch_queue_pipe_buf_release,
+ .try_steal = watch_queue_pipe_buf_try_steal,
+ .get = generic_pipe_buf_get,
+};
+
+/*
+ * Post a notification to a watch queue.
+ */
+static bool post_one_notification(struct watch_queue *wqueue,
+ struct watch_notification *n)
+{
+ void *p;
+ struct pipe_inode_info *pipe = wqueue->pipe;
+ struct pipe_buffer *buf;
+ struct page *page;
+ unsigned int head, tail, mask, note, offset, len;
+ bool done = false;
+
+ if (!pipe)
+ return false;
+
+ spin_lock_irq(&pipe->rd_wait.lock);
+
+ if (wqueue->defunct)
+ goto out;
+
+ mask = pipe->ring_size - 1;
+ head = pipe->head;
+ tail = pipe->tail;
+ if (pipe_full(head, tail, pipe->ring_size))
+ goto lost;
+
+ note = find_first_bit(wqueue->notes_bitmap, wqueue->nr_notes);
+ if (note >= wqueue->nr_notes)
+ goto lost;
+
+ page = wqueue->notes[note / WATCH_QUEUE_NOTES_PER_PAGE];
+ offset = (note % WATCH_QUEUE_NOTES_PER_PAGE) * WATCH_QUEUE_NOTE_SIZE;
+ get_page(page);
+ len = n->info & WATCH_INFO_LENGTH;
+ p = kmap_atomic(page);
+ memcpy(p + offset, n, len);
+ kunmap_atomic(p);
+
+ buf = &pipe->bufs[head & mask];
+ buf->page = page;
+ buf->private = (unsigned long)wqueue;
+ buf->ops = &watch_queue_pipe_buf_ops;
+ buf->offset = offset;
+ buf->len = len;
+ buf->flags = PIPE_BUF_FLAG_WHOLE;
+ pipe->head = head + 1;
+
+ if (!test_and_clear_bit(note, wqueue->notes_bitmap)) {
+ spin_unlock_irq(&pipe->rd_wait.lock);
+ BUG();
+ }
+ wake_up_interruptible_sync_poll_locked(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
+ done = true;
+
+out:
+ spin_unlock_irq(&pipe->rd_wait.lock);
+ if (done)
+ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+ return done;
+
+lost:
+ buf = &pipe->bufs[(head - 1) & mask];
+ buf->flags |= PIPE_BUF_FLAG_LOSS;
+ goto out;
+}
+
+/*
+ * Apply filter rules to a notification.
+ */
+static bool filter_watch_notification(const struct watch_filter *wf,
+ const struct watch_notification *n)
+{
+ const struct watch_type_filter *wt;
+ unsigned int st_bits = sizeof(wt->subtype_filter[0]) * 8;
+ unsigned int st_index = n->subtype / st_bits;
+ unsigned int st_bit = 1U << (n->subtype % st_bits);
+ int i;
+
+ if (!test_bit(n->type, wf->type_filter))
+ return false;
+
+ for (i = 0; i < wf->nr_filters; i++) {
+ wt = &wf->filters[i];
+ if (n->type == wt->type &&
+ (wt->subtype_filter[st_index] & st_bit) &&
+ (n->info & wt->info_mask) == wt->info_filter)
+ return true;
+ }
+
+ return false; /* If there is a filter, the default is to reject. */
+}
+
+/**
+ * __post_watch_notification - Post an event notification
+ * @wlist: The watch list to post the event to.
+ * @n: The notification record to post.
+ * @cred: The creds of the process that triggered the notification.
+ * @id: The ID to match on the watch.
+ *
+ * Post a notification of an event into a set of watch queues and let the users
+ * know.
+ *
+ * The size of the notification should be set in n->info & WATCH_INFO_LENGTH,
+ * in bytes (watch_sizeof() derives it from the record structure type).
+ */
+void __post_watch_notification(struct watch_list *wlist,
+ struct watch_notification *n,
+ const struct cred *cred,
+ u64 id)
+{
+ const struct watch_filter *wf;
+ struct watch_queue *wqueue;
+ struct watch *watch;
+
+ if (((n->info & WATCH_INFO_LENGTH) >> WATCH_INFO_LENGTH__SHIFT) == 0) {
+ WARN_ON(1);
+ return;
+ }
+
+ rcu_read_lock();
+
+ hlist_for_each_entry_rcu(watch, &wlist->watchers, list_node) {
+ if (watch->id != id)
+ continue;
+ n->info &= ~WATCH_INFO_ID;
+ n->info |= watch->info_id;
+
+ wqueue = rcu_dereference(watch->queue);
+ wf = rcu_dereference(wqueue->filter);
+ if (wf && !filter_watch_notification(wf, n))
+ continue;
+
+ if (security_post_notification(watch->cred, cred, n) < 0)
+ continue;
+
+ post_one_notification(wqueue, n);
+ }
+
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(__post_watch_notification);
+
+/*
+ * Allocate sufficient pages to preallocation for the requested number of
+ * notifications.
+ */
+long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
+{
+ struct watch_queue *wqueue = pipe->watch_queue;
+ struct page **pages;
+ unsigned long *bitmap;
+ unsigned long user_bufs;
+ unsigned int bmsize;
+ int ret, i, nr_pages;
+
+ if (!wqueue)
+ return -ENODEV;
+ if (wqueue->notes)
+ return -EBUSY;
+
+ if (nr_notes < 1 ||
+ nr_notes > 512) /* TODO: choose a better hard limit */
+ return -EINVAL;
+
+ nr_pages = (nr_notes + WATCH_QUEUE_NOTES_PER_PAGE - 1);
+ nr_pages /= WATCH_QUEUE_NOTES_PER_PAGE;
+ user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_pages);
+
+ if (nr_pages > pipe->max_usage &&
+ (too_many_pipe_buffers_hard(user_bufs) ||
+ too_many_pipe_buffers_soft(user_bufs)) &&
+ pipe_is_unprivileged_user()) {
+ ret = -EPERM;
+ goto error;
+ }
+
+ ret = pipe_resize_ring(pipe, nr_notes);
+ if (ret < 0)
+ goto error;
+
+ ret = -ENOMEM;
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+ if (!pages)
+ goto error;
+
+ for (i = 0; i < nr_pages; i++) {
+ pages[i] = alloc_page(GFP_KERNEL);
+ if (!pages[i])
+ goto error_p;
+ pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE;
+ }
+
+ bmsize = (nr_notes + BITS_PER_LONG - 1) / BITS_PER_LONG;
+ bmsize *= sizeof(unsigned long);
+ bitmap = kmalloc(bmsize, GFP_KERNEL);
+ if (!bitmap)
+ goto error_p;
+
+ memset(bitmap, 0xff, bmsize);
+ wqueue->notes = pages;
+ wqueue->notes_bitmap = bitmap;
+ wqueue->nr_pages = nr_pages;
+ wqueue->nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE;
+ return 0;
+
+error_p:
+ while (--i >= 0)
+ __free_page(pages[i]);
+ kfree(pages);
+error:
+ (void) account_pipe_buffers(pipe->user, nr_pages, pipe->nr_accounted);
+ return ret;
+}
+
+/*
+ * Set the filter on a watch queue.
+ */
+long watch_queue_set_filter(struct pipe_inode_info *pipe,
+ struct watch_notification_filter __user *_filter)
+{
+ struct watch_notification_type_filter *tf;
+ struct watch_notification_filter filter;
+ struct watch_type_filter *q;
+ struct watch_filter *wfilter;
+ struct watch_queue *wqueue = pipe->watch_queue;
+ int ret, nr_filter = 0, i;
+
+ if (!wqueue)
+ return -ENODEV;
+
+ if (!_filter) {
+ /* Remove the old filter */
+ wfilter = NULL;
+ goto set;
+ }
+
+ /* Grab the user's filter specification */
+ if (copy_from_user(&filter, _filter, sizeof(filter)) != 0)
+ return -EFAULT;
+ if (filter.nr_filters == 0 ||
+ filter.nr_filters > 16 ||
+ filter.__reserved != 0)
+ return -EINVAL;
+
+ tf = memdup_user(_filter->filters, filter.nr_filters * sizeof(*tf));
+ if (IS_ERR(tf))
+ return PTR_ERR(tf);
+
+ ret = -EINVAL;
+ for (i = 0; i < filter.nr_filters; i++) {
+ if ((tf[i].info_filter & ~tf[i].info_mask) ||
+ tf[i].info_mask & WATCH_INFO_LENGTH)
+ goto err_filter;
+ /* Ignore any unknown types */
+ if (tf[i].type >= sizeof(wfilter->type_filter) * 8)
+ continue;
+ nr_filter++;
+ }
+
+ /* Now we need to build the internal filter from only the relevant
+ * user-specified filters.
+ */
+ ret = -ENOMEM;
+ wfilter = kzalloc(struct_size(wfilter, filters, nr_filter), GFP_KERNEL);
+ if (!wfilter)
+ goto err_filter;
+ wfilter->nr_filters = nr_filter;
+
+ q = wfilter->filters;
+ for (i = 0; i < filter.nr_filters; i++) {
+ if (tf[i].type >= sizeof(wfilter->type_filter) * 8)
+ continue;
+
+ q->type = tf[i].type;
+ q->info_filter = tf[i].info_filter;
+ q->info_mask = tf[i].info_mask;
+ q->subtype_filter[0] = tf[i].subtype_filter[0];
+ __set_bit(q->type, wfilter->type_filter);
+ q++;
+ }
+
+ kfree(tf);
+set:
+ pipe_lock(pipe);
+ wfilter = rcu_replace_pointer(wqueue->filter, wfilter,
+ lockdep_is_held(&pipe->mutex));
+ pipe_unlock(pipe);
+ if (wfilter)
+ kfree_rcu(wfilter, rcu);
+ return 0;
+
+err_filter:
+ kfree(tf);
+ return ret;
+}
+
+static void __put_watch_queue(struct kref *kref)
+{
+ struct watch_queue *wqueue =
+ container_of(kref, struct watch_queue, usage);
+ struct watch_filter *wfilter;
+ int i;
+
+ for (i = 0; i < wqueue->nr_pages; i++)
+ __free_page(wqueue->notes[i]);
+ kfree(wqueue->notes);
+ kfree(wqueue->notes_bitmap);
+
+ wfilter = rcu_access_pointer(wqueue->filter);
+ if (wfilter)
+ kfree_rcu(wfilter, rcu);
+ kfree_rcu(wqueue, rcu);
+}
+
+/**
+ * put_watch_queue - Dispose of a ref on a watch queue.
+ * @wqueue: The watch queue to unref.
+ */
+void put_watch_queue(struct watch_queue *wqueue)
+{
+ kref_put(&wqueue->usage, __put_watch_queue);
+}
+EXPORT_SYMBOL(put_watch_queue);
+
+static void free_watch(struct rcu_head *rcu)
+{
+ struct watch *watch = container_of(rcu, struct watch, rcu);
+
+ put_watch_queue(rcu_access_pointer(watch->queue));
+ put_cred(watch->cred);
+ kfree(watch);
+}
+
+static void __put_watch(struct kref *kref)
+{
+ struct watch *watch = container_of(kref, struct watch, usage);
+
+ call_rcu(&watch->rcu, free_watch);
+}
+
+/*
+ * Discard a watch.
+ */
+static void put_watch(struct watch *watch)
+{
+ kref_put(&watch->usage, __put_watch);
+}
+
+/**
+ * init_watch - Initialise a watch
+ * @watch: The watch to initialise.
+ * @wqueue: The queue to assign.
+ *
+ * Initialise a watch and set the watch queue.
+ */
+void init_watch(struct watch *watch, struct watch_queue *wqueue)
+{
+ kref_init(&watch->usage);
+ INIT_HLIST_NODE(&watch->list_node);
+ INIT_HLIST_NODE(&watch->queue_node);
+ rcu_assign_pointer(watch->queue, wqueue);
+}
+
+/**
+ * add_watch_to_object - Add a watch on an object to a watch list
+ * @watch: The watch to add
+ * @wlist: The watch list to add to
+ *
+ * @watch->queue must have been set to point to the queue to post
+ * notifications to. The watch's credentials are taken from the current
+ * process by this function.
+ *
+ * The caller must pin the queue and the list both and must hold the list
+ * locked against racing watch additions/removals.
+ */
+int add_watch_to_object(struct watch *watch, struct watch_list *wlist)
+{
+ struct watch_queue *wqueue = rcu_access_pointer(watch->queue);
+ struct watch *w;
+
+ hlist_for_each_entry(w, &wlist->watchers, list_node) {
+ struct watch_queue *wq = rcu_access_pointer(w->queue);
+ if (wqueue == wq && watch->id == w->id)
+ return -EBUSY;
+ }
+
+ watch->cred = get_current_cred();
+ rcu_assign_pointer(watch->watch_list, wlist);
+
+ spin_lock_bh(&wqueue->lock);
+ kref_get(&wqueue->usage);
+ kref_get(&watch->usage);
+ hlist_add_head(&watch->queue_node, &wqueue->watches);
+ spin_unlock_bh(&wqueue->lock);
+
+ hlist_add_head(&watch->list_node, &wlist->watchers);
+ return 0;
+}
+EXPORT_SYMBOL(add_watch_to_object);
+
+/**
+ * remove_watch_from_object - Remove a watch or all watches from an object.
+ * @wlist: The watch list to remove from
+ * @wq: The watch queue of interest (ignored if @all is true)
+ * @id: The ID of the watch to remove (ignored if @all is true)
+ * @all: True to remove all watches
+ *
+ * Remove a specific watch or all watches from an object. A notification is
+ * sent to the watcher to tell them that this happened.
+ */
+int remove_watch_from_object(struct watch_list *wlist, struct watch_queue *wq,
+ u64 id, bool all)
+{
+ struct watch_notification_removal n;
+ struct watch_queue *wqueue;
+ struct watch *watch;
+ int ret = -EBADSLT;
+
+ rcu_read_lock();
+
+again:
+ spin_lock(&wlist->lock);
+ hlist_for_each_entry(watch, &wlist->watchers, list_node) {
+ if (all ||
+ (watch->id == id && rcu_access_pointer(watch->queue) == wq))
+ goto found;
+ }
+ spin_unlock(&wlist->lock);
+ goto out;
+
+found:
+ ret = 0;
+ hlist_del_init_rcu(&watch->list_node);
+ rcu_assign_pointer(watch->watch_list, NULL);
+ spin_unlock(&wlist->lock);
+
+ /* We now own the reference on watch that used to belong to wlist. */
+
+ n.watch.type = WATCH_TYPE_META;
+ n.watch.subtype = WATCH_META_REMOVAL_NOTIFICATION;
+ n.watch.info = watch->info_id | watch_sizeof(n.watch);
+ n.id = id;
+ if (id != 0)
+ n.watch.info = watch->info_id | watch_sizeof(n);
+
+ wqueue = rcu_dereference(watch->queue);
+
+ /* We don't need the watch list lock for the next bit as RCU is
+ * protecting *wqueue from deallocation.
+ */
+ if (wqueue) {
+ post_one_notification(wqueue, &n.watch);
+
+ spin_lock_bh(&wqueue->lock);
+
+ if (!hlist_unhashed(&watch->queue_node)) {
+ hlist_del_init_rcu(&watch->queue_node);
+ put_watch(watch);
+ }
+
+ spin_unlock_bh(&wqueue->lock);
+ }
+
+ if (wlist->release_watch) {
+ void (*release_watch)(struct watch *);
+
+ release_watch = wlist->release_watch;
+ rcu_read_unlock();
+ (*release_watch)(watch);
+ rcu_read_lock();
+ }
+ put_watch(watch);
+
+ if (all && !hlist_empty(&wlist->watchers))
+ goto again;
+out:
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL(remove_watch_from_object);
+
+/*
+ * Remove all the watches that are contributory to a queue. This has the
+ * potential to race with removal of the watches by the destruction of the
+ * objects being watched or with the distribution of notifications.
+ */
+void watch_queue_clear(struct watch_queue *wqueue)
+{
+ struct watch_list *wlist;
+ struct watch *watch;
+ bool release;
+
+ rcu_read_lock();
+ spin_lock_bh(&wqueue->lock);
+
+ /* Prevent new additions and prevent notifications from happening */
+ wqueue->defunct = true;
+
+ while (!hlist_empty(&wqueue->watches)) {
+ watch = hlist_entry(wqueue->watches.first, struct watch, queue_node);
+ hlist_del_init_rcu(&watch->queue_node);
+ /* We now own a ref on the watch. */
+ spin_unlock_bh(&wqueue->lock);
+
+ /* We can't do the next bit under the queue lock as we need to
+ * get the list lock - which would cause a deadlock if someone
+ * was removing from the opposite direction at the same time or
+ * posting a notification.
+ */
+ wlist = rcu_dereference(watch->watch_list);
+ if (wlist) {
+ void (*release_watch)(struct watch *);
+
+ spin_lock(&wlist->lock);
+
+ release = !hlist_unhashed(&watch->list_node);
+ if (release) {
+ hlist_del_init_rcu(&watch->list_node);
+ rcu_assign_pointer(watch->watch_list, NULL);
+
+ /* We now own a second ref on the watch. */
+ }
+
+ release_watch = wlist->release_watch;
+ spin_unlock(&wlist->lock);
+
+ if (release) {
+ if (release_watch) {
+ rcu_read_unlock();
+ /* This might need to call dput(), so
+ * we have to drop all the locks.
+ */
+ (*release_watch)(watch);
+ rcu_read_lock();
+ }
+ put_watch(watch);
+ }
+ }
+
+ put_watch(watch);
+ spin_lock_bh(&wqueue->lock);
+ }
+
+ spin_unlock_bh(&wqueue->lock);
+ rcu_read_unlock();
+}
+
+/**
+ * get_watch_queue - Get a watch queue from its file descriptor.
+ * @fd: The fd to query.
+ */
+struct watch_queue *get_watch_queue(int fd)
+{
+ struct pipe_inode_info *pipe;
+ struct watch_queue *wqueue = ERR_PTR(-EINVAL);
+ struct fd f;
+
+ f = fdget(fd);
+ if (f.file) {
+ pipe = get_pipe_info(f.file, false);
+ if (pipe && pipe->watch_queue) {
+ wqueue = pipe->watch_queue;
+ kref_get(&wqueue->usage);
+ }
+ fdput(f);
+ }
+
+ return wqueue;
+}
+EXPORT_SYMBOL(get_watch_queue);
+
+/*
+ * Initialise a watch queue
+ */
+int watch_queue_init(struct pipe_inode_info *pipe)
+{
+ struct watch_queue *wqueue;
+
+ wqueue = kzalloc(sizeof(*wqueue), GFP_KERNEL);
+ if (!wqueue)
+ return -ENOMEM;
+
+ wqueue->pipe = pipe;
+ kref_init(&wqueue->usage);
+ spin_lock_init(&wqueue->lock);
+ INIT_HLIST_HEAD(&wqueue->watches);
+
+ pipe->watch_queue = wqueue;
+ return 0;
+}
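
For a subsystem author, the pieces above combine roughly as follows. This is
an illustrative sketch only, not part of the patch: the "xyz" object, its
fields and the reuse of WATCH_TYPE_META are hypothetical stand-ins (a real
user would define its own WATCH_TYPE_* and subtype enum in the UAPI header),
and error handling is elided::

	#include <linux/watch_queue.h>
	#include <linux/cred.h>

	struct xyz_object {
		struct watch_list watchers;	/* one list per watchable object */
	};

	static void xyz_init(struct xyz_object *obj)
	{
		init_watch_list(&obj->watchers, NULL);
	}

	/* Post an event; only watches whose id matches @obj_id receive it. */
	static void xyz_notify_change(struct xyz_object *obj, u64 obj_id)
	{
		struct watch_notification n = {
			.type	 = WATCH_TYPE_META,	/* hypothetical: define your own type */
			.subtype = 0,
			.info	 = watch_sizeof(n),	/* length in the WATCH_INFO_LENGTH bits */
		};

		post_watch_notification(&obj->watchers, &n, current_cred(), obj_id);
	}

post_watch_notification() is the inline wrapper around
__post_watch_notification() that skips the call when no watch list has been
allocated, mirroring what notify_key() does for keys further down.
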
diff --git a/lib/bsearch.c b/lib/bsearch.c
index 8b3aae5..bf86aa6 100644
--- a/lib/bsearch.c
+++ b/lib/bsearch.c
@@ -28,27 +28,9 @@
* the key and elements in the array are of the same type, you can use
* the same comparison function for both sort() and bsearch().
*/
-void *bsearch(const void *key, const void *base, size_t num, size_t size,
- cmp_func_t cmp)
+void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp)
{
- const char *pivot;
- int result;
-
- while (num > 0) {
- pivot = base + (num >> 1) * size;
- result = cmp(key, pivot);
-
- if (result == 0)
- return (void *)pivot;
-
- if (result > 0) {
- base = pivot + size;
- num--;
- }
- num >>= 1;
- }
-
- return NULL;
+ return __inline_bsearch(key, base, num, size, cmp);
}
EXPORT_SYMBOL(bsearch);
NOKPROBE_SYMBOL(bsearch);
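
The new body delegates to __inline_bsearch(), an __always_inline variant
moved into the header so that callers which must avoid instrumentation can
inline the search; the exported symbol keeps the old interface. The
comparator contract described in the comment above is unchanged. A minimal,
hypothetical kernel-side sketch reusing one callback for both sort() and
bsearch()::

	#include <linux/bsearch.h>
	#include <linux/sort.h>
	#include <linux/types.h>

	static int cmp_u32(const void *a, const void *b)
	{
		u32 x = *(const u32 *)a, y = *(const u32 *)b;

		return x < y ? -1 : (x > y ? 1 : 0);
	}

	static u32 *find_val(u32 *table, size_t n, u32 key)
	{
		/* The same comparator serves both the sort and the search. */
		sort(table, n, sizeof(*table), cmp_u32, NULL);
		return bsearch(&key, table, n, sizeof(*table), cmp_u32);
	}
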
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
index bd95716..525222e 100644
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -8,7 +8,7 @@
#include <linux/kprobes.h>
#include <linux/sched.h>
-notrace static nokprobe_inline
+noinstr static
unsigned int check_preemption_disabled(const char *what1, const char *what2)
{
int this_cpu = raw_smp_processor_id();
@@ -37,6 +37,7 @@
*/
preempt_disable_notrace();
+ instrumentation_begin();
if (!printk_ratelimit())
goto out_enable;
@@ -45,6 +46,7 @@
printk("caller is %pS\n", __builtin_return_address(0));
dump_stack();
 out_enable:
+ instrumentation_end();
preempt_enable_no_resched_notrace();
@@ -52,16 +54,14 @@
return this_cpu;
}
-notrace unsigned int debug_smp_processor_id(void)
+noinstr unsigned int debug_smp_processor_id(void)
{
return check_preemption_disabled("smp_processor_id", "");
}
EXPORT_SYMBOL(debug_smp_processor_id);
-NOKPROBE_SYMBOL(debug_smp_processor_id);
-notrace void __this_cpu_preempt_check(const char *op)
+noinstr void __this_cpu_preempt_check(const char *op)
{
check_preemption_disabled("__this_cpu_", op);
}
EXPORT_SYMBOL(__this_cpu_preempt_check);
-NOKPROBE_SYMBOL(__this_cpu_preempt_check);
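
The conversion above follows the noinstr pattern: the function as a whole
must be callable from non-instrumentable contexts, so any work that may be
traced (printk(), dump_stack()) is bracketed with instrumentation_begin()
and instrumentation_end(), as in the hunks above. A condensed, hypothetical
sketch of the shape::

	#include <linux/cpumask.h>
	#include <linux/instrumentation.h>
	#include <linux/printk.h>
	#include <linux/smp.h>

	noinstr unsigned int sketch_helper(void)
	{
		unsigned int cpu = raw_smp_processor_id();	/* always safe */

		if (unlikely(!cpu_online(cpu))) {
			/* Anything that may be traced must sit inside an
			 * instrumentation_begin()/end() bracket.
			 */
			instrumentation_begin();
			pr_warn("CPU %u is not online\n", cpu);
			instrumentation_end();
		}
		return cpu;
	}
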
diff --git a/samples/Kconfig b/samples/Kconfig
index 0cbb614..add4311 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -209,4 +209,11 @@
bool "watchdog sample"
depends on CC_CAN_LINK
+config SAMPLE_WATCH_QUEUE
+ bool "Build example /dev/watch_queue notification consumer"
+ depends on HEADERS_INSTALL
+ help
+ Build an example userspace program that uses the KEYCTL_WATCH_KEY
+ keyctl() function to consume key/keyring change notifications.
+
endif # SAMPLES
diff --git a/samples/Makefile b/samples/Makefile
index 29c66aa..5b26297 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -27,3 +27,4 @@
subdir-$(CONFIG_SAMPLE_VFS) += vfs
obj-$(CONFIG_SAMPLE_INTEL_MEI) += mei/
subdir-$(CONFIG_SAMPLE_WATCHDOG) += watchdog
+subdir-$(CONFIG_SAMPLE_WATCH_QUEUE) += watch_queue
diff --git a/samples/watch_queue/Makefile b/samples/watch_queue/Makefile
new file mode 100644
index 0000000..8511fb6
--- /dev/null
+++ b/samples/watch_queue/Makefile
@@ -0,0 +1,7 @@
+# List of programs to build
+hostprogs := watch_test
+
+# Tell kbuild to always build the programs
+always-y := $(hostprogs)
+
+HOSTCFLAGS_watch_test.o += -I$(objtree)/usr/include
diff --git a/samples/watch_queue/watch_test.c b/samples/watch_queue/watch_test.c
new file mode 100644
index 0000000..46e618a
--- /dev/null
+++ b/samples/watch_queue/watch_test.c
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Use the watch_queue API to watch for notifications.
+ *
+ * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#define _GNU_SOURCE
+#include <stdbool.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <signal.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/ioctl.h>
+#include <limits.h>
+#include <linux/watch_queue.h>
+#include <linux/unistd.h>
+#include <linux/keyctl.h>
+
+#ifndef KEYCTL_WATCH_KEY
+#define KEYCTL_WATCH_KEY -1
+#endif
+#ifndef __NR_keyctl
+#define __NR_keyctl -1
+#endif
+
+#define BUF_SIZE 256
+
+static long keyctl_watch_key(int key, int watch_fd, int watch_id)
+{
+ return syscall(__NR_keyctl, KEYCTL_WATCH_KEY, key, watch_fd, watch_id);
+}
+
+static const char *key_subtypes[256] = {
+ [NOTIFY_KEY_INSTANTIATED] = "instantiated",
+ [NOTIFY_KEY_UPDATED] = "updated",
+ [NOTIFY_KEY_LINKED] = "linked",
+ [NOTIFY_KEY_UNLINKED] = "unlinked",
+ [NOTIFY_KEY_CLEARED] = "cleared",
+ [NOTIFY_KEY_REVOKED] = "revoked",
+ [NOTIFY_KEY_INVALIDATED] = "invalidated",
+ [NOTIFY_KEY_SETATTR] = "setattr",
+};
+
+static void saw_key_change(struct watch_notification *n, size_t len)
+{
+ struct key_notification *k = (struct key_notification *)n;
+
+ if (len != sizeof(struct key_notification)) {
+ fprintf(stderr, "Incorrect key message length\n");
+ return;
+ }
+
+ printf("KEY %08x change=%u[%s] aux=%u\n",
+ k->key_id, n->subtype, key_subtypes[n->subtype], k->aux);
+}
+
+/*
+ * Consume and display events.
+ */
+static void consumer(int fd)
+{
+ unsigned char buffer[433], *p, *end;
+ union {
+ struct watch_notification n;
+ unsigned char buf1[128];
+ } n;
+ ssize_t buf_len;
+
+ for (;;) {
+ buf_len = read(fd, buffer, sizeof(buffer));
+ if (buf_len == -1) {
+ perror("read");
+ exit(1);
+ }
+
+ if (buf_len == 0) {
+ printf("-- END --\n");
+ return;
+ }
+
+ if (buf_len > sizeof(buffer)) {
+ fprintf(stderr, "Read buffer overrun: %zd\n", buf_len);
+ return;
+ }
+
+ printf("read() = %zd\n", buf_len);
+
+ p = buffer;
+ end = buffer + buf_len;
+ while (p < end) {
+ size_t largest, len;
+
+ largest = end - p;
+ if (largest > 128)
+ largest = 128;
+ if (largest < sizeof(struct watch_notification)) {
+ fprintf(stderr, "Short message header: %zu\n", largest);
+ return;
+ }
+ memcpy(&n, p, largest);
+
+ printf("NOTIFY[%03zx]: ty=%06x sy=%02x i=%08x\n",
+ p - buffer, n.n.type, n.n.subtype, n.n.info);
+
+ len = n.n.info & WATCH_INFO_LENGTH;
+ if (len < sizeof(n.n) || len > largest) {
+ fprintf(stderr, "Bad message length: %zu/%zu\n", len, largest);
+ exit(1);
+ }
+
+ switch (n.n.type) {
+ case WATCH_TYPE_META:
+ switch (n.n.subtype) {
+ case WATCH_META_REMOVAL_NOTIFICATION:
+ printf("REMOVAL of watchpoint %08x\n",
+ (n.n.info & WATCH_INFO_ID) >>
+ WATCH_INFO_ID__SHIFT);
+ break;
+ case WATCH_META_LOSS_NOTIFICATION:
+ printf("-- LOSS --\n");
+ break;
+ default:
+ printf("other meta record\n");
+ break;
+ }
+ break;
+ case WATCH_TYPE_KEY_NOTIFY:
+ saw_key_change(&n.n, len);
+ break;
+ default:
+ printf("other type\n");
+ break;
+ }
+
+ p += len;
+ }
+ }
+}
+
+static struct watch_notification_filter filter = {
+ .nr_filters = 1,
+ .filters = {
+ [0] = {
+ .type = WATCH_TYPE_KEY_NOTIFY,
+ .subtype_filter[0] = UINT_MAX,
+ },
+ },
+};
+
+int main(int argc, char **argv)
+{
+ int pipefd[2], fd;
+
+ if (pipe2(pipefd, O_NOTIFICATION_PIPE) == -1) {
+ perror("pipe2");
+ exit(1);
+ }
+ fd = pipefd[0];
+
+ if (ioctl(fd, IOC_WATCH_QUEUE_SET_SIZE, BUF_SIZE) == -1) {
+ perror("watch_queue(size)");
+ exit(1);
+ }
+
+ if (ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, &filter) == -1) {
+ perror("watch_queue(filter)");
+ exit(1);
+ }
+
+ if (keyctl_watch_key(KEY_SPEC_SESSION_KEYRING, fd, 0x01) == -1) {
+ perror("keyctl");
+ exit(1);
+ }
+
+ if (keyctl_watch_key(KEY_SPEC_USER_KEYRING, fd, 0x02) == -1) {
+ perror("keyctl");
+ exit(1);
+ }
+
+ consumer(fd);
+ exit(0);
+}
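
With CONFIG_SAMPLE_WATCH_QUEUE=y and the UAPI headers installed, watch_test
places watches on the session and user keyrings and then blocks in read().
Events can be generated from a second program run in the same session; the
sketch below is hypothetical and not part of the patch (the key type,
description and payload are arbitrary)::

	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>
	#include <linux/unistd.h>
	#include <linux/keyctl.h>

	int main(void)
	{
		/* Adding a key to the session keyring fires the watch that
		 * watch_test placed on KEY_SPEC_SESSION_KEYRING, so a
		 * NOTIFY_KEY_LINKED event should appear on the queue.
		 */
		long id = syscall(__NR_add_key, "user", "watch_test:demo",
				  "payload", 7, KEY_SPEC_SESSION_KEYRING);

		if (id == -1) {
			perror("add_key");
			exit(1);
		}
		printf("added key %ld\n", id);
		return 0;
	}
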
diff --git a/security/keys/Kconfig b/security/keys/Kconfig
index 8153ea01..83bc234 100644
--- a/security/keys/Kconfig
+++ b/security/keys/Kconfig
@@ -114,3 +114,12 @@
in the kernel.
If you are unsure as to whether this is required, answer N.
+
+config KEY_NOTIFICATIONS
+ bool "Provide key/keyring change notifications"
+ depends on KEYS && WATCH_QUEUE
+ help
+ This option provides support for getting change notifications on keys
+ and keyrings on which the caller has View permission. This makes use
+ of the /dev/watch_queue misc device to handle the notification
+ buffer and provides KEYCTL_WATCH_KEY to enable/disable watches.
diff --git a/security/keys/compat.c b/security/keys/compat.c
index b975f8f..6ee9d8f 100644
--- a/security/keys/compat.c
+++ b/security/keys/compat.c
@@ -156,6 +156,9 @@
case KEYCTL_CAPABILITIES:
return keyctl_capabilities(compat_ptr(arg2), arg3);
+ case KEYCTL_WATCH_KEY:
+ return keyctl_watch_key(arg2, arg3, arg4);
+
default:
return -EOPNOTSUPP;
}
diff --git a/security/keys/gc.c b/security/keys/gc.c
index 671dd73..3c90807 100644
--- a/security/keys/gc.c
+++ b/security/keys/gc.c
@@ -131,6 +131,11 @@
kdebug("- %u", key->serial);
key_check(key);
+#ifdef CONFIG_KEY_NOTIFICATIONS
+ remove_watch_list(key->watchers, key->serial);
+ key->watchers = NULL;
+#endif
+
/* Throw away the key data if the key is instantiated */
if (state == KEY_IS_POSITIVE && key->type->destroy)
key->type->destroy(key);
diff --git a/security/keys/internal.h b/security/keys/internal.h
index 153d35c..338a526 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -15,6 +15,7 @@
#include <linux/task_work.h>
#include <linux/keyctl.h>
#include <linux/refcount.h>
+#include <linux/watch_queue.h>
#include <linux/compat.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
@@ -99,7 +100,8 @@
const struct keyring_index_key *index_key,
struct assoc_array_edit **_edit);
extern int __key_link_check_live_key(struct key *keyring, struct key *key);
-extern void __key_link(struct key *key, struct assoc_array_edit **_edit);
+extern void __key_link(struct key *keyring, struct key *key,
+ struct assoc_array_edit **_edit);
extern void __key_link_end(struct key *keyring,
const struct keyring_index_key *index_key,
struct assoc_array_edit *edit);
@@ -165,7 +167,6 @@
const struct key_match_data *match_data);
#define KEY_LOOKUP_CREATE 0x01
#define KEY_LOOKUP_PARTIAL 0x02
-#define KEY_LOOKUP_FOR_UNLINK 0x04
extern long join_session_keyring(const char *name);
extern void key_change_session_keyring(struct callback_head *twork);
@@ -181,14 +182,32 @@
extern int key_task_permission(const key_ref_t key_ref,
const struct cred *cred,
- key_perm_t perm);
+ enum key_need_perm need_perm);
+
+static inline void notify_key(struct key *key,
+ enum key_notification_subtype subtype, u32 aux)
+{
+#ifdef CONFIG_KEY_NOTIFICATIONS
+ struct key_notification n = {
+ .watch.type = WATCH_TYPE_KEY_NOTIFY,
+ .watch.subtype = subtype,
+ .watch.info = watch_sizeof(n),
+ .key_id = key_serial(key),
+ .aux = aux,
+ };
+
+ post_watch_notification(key->watchers, &n.watch, current_cred(),
+ n.key_id);
+#endif
+}
/*
* Check to see whether permission is granted to use a key in the desired way.
*/
-static inline int key_permission(const key_ref_t key_ref, unsigned perm)
+static inline int key_permission(const key_ref_t key_ref,
+ enum key_need_perm need_perm)
{
- return key_task_permission(key_ref, current_cred(), perm);
+ return key_task_permission(key_ref, current_cred(), need_perm);
}
extern struct key_type key_type_request_key_auth;
@@ -333,6 +352,15 @@
extern long keyctl_capabilities(unsigned char __user *_buffer, size_t buflen);
+#ifdef CONFIG_KEY_NOTIFICATIONS
+extern long keyctl_watch_key(key_serial_t, int, int);
+#else
+static inline long keyctl_watch_key(key_serial_t key_id, int watch_fd, int watch_id)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
/*
* Debugging key validation
*/
diff --git a/security/keys/key.c b/security/keys/key.c
index e959b3c..e282c61 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -444,6 +444,7 @@
/* mark the key as being instantiated */
atomic_inc(&key->user->nikeys);
mark_key_instantiated(key, 0);
+ notify_key(key, NOTIFY_KEY_INSTANTIATED, 0);
if (test_and_clear_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags))
awaken = 1;
@@ -453,7 +454,7 @@
if (test_bit(KEY_FLAG_KEEP, &keyring->flags))
set_bit(KEY_FLAG_KEEP, &key->flags);
- __key_link(key, _edit);
+ __key_link(keyring, key, _edit);
}
/* disable the authorisation key */
@@ -601,6 +602,7 @@
/* mark the key as being negatively instantiated */
atomic_inc(&key->user->nikeys);
mark_key_instantiated(key, -error);
+ notify_key(key, NOTIFY_KEY_INSTANTIATED, -error);
key->expiry = ktime_get_real_seconds() + timeout;
key_schedule_gc(key->expiry + key_gc_delay);
@@ -611,7 +613,7 @@
/* and link it into the destination keyring */
if (keyring && link_ret == 0)
- __key_link(key, &edit);
+ __key_link(keyring, key, &edit);
/* disable the authorisation key */
if (authkey)
@@ -764,9 +766,11 @@
down_write(&key->sem);
ret = key->type->update(key, prep);
- if (ret == 0)
+ if (ret == 0) {
/* Updating a negative key positively instantiates it */
mark_key_instantiated(key, 0);
+ notify_key(key, NOTIFY_KEY_UPDATED, 0);
+ }
up_write(&key->sem);
@@ -1023,9 +1027,11 @@
down_write(&key->sem);
ret = key->type->update(key, &prep);
- if (ret == 0)
+ if (ret == 0) {
/* Updating a negative key positively instantiates it */
mark_key_instantiated(key, 0);
+ notify_key(key, NOTIFY_KEY_UPDATED, 0);
+ }
up_write(&key->sem);
@@ -1057,15 +1063,17 @@
* instantiated
*/
down_write_nested(&key->sem, 1);
- if (!test_and_set_bit(KEY_FLAG_REVOKED, &key->flags) &&
- key->type->revoke)
- key->type->revoke(key);
+ if (!test_and_set_bit(KEY_FLAG_REVOKED, &key->flags)) {
+ notify_key(key, NOTIFY_KEY_REVOKED, 0);
+ if (key->type->revoke)
+ key->type->revoke(key);
- /* set the death time to no more than the expiry time */
- time = ktime_get_real_seconds();
- if (key->revoked_at == 0 || key->revoked_at > time) {
- key->revoked_at = time;
- key_schedule_gc(key->revoked_at + key_gc_delay);
+ /* set the death time to no more than the expiry time */
+ time = ktime_get_real_seconds();
+ if (key->revoked_at == 0 || key->revoked_at > time) {
+ key->revoked_at = time;
+ key_schedule_gc(key->revoked_at + key_gc_delay);
+ }
}
up_write(&key->sem);
@@ -1087,8 +1095,10 @@
if (!test_bit(KEY_FLAG_INVALIDATED, &key->flags)) {
down_write_nested(&key->sem, 1);
- if (!test_and_set_bit(KEY_FLAG_INVALIDATED, &key->flags))
+ if (!test_and_set_bit(KEY_FLAG_INVALIDATED, &key->flags)) {
+ notify_key(key, NOTIFY_KEY_INVALIDATED, 0);
key_schedule_gc_links();
+ }
up_write(&key->sem);
}
}
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index e5ef20a..9febd37 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -37,7 +37,9 @@
KEYCTL_CAPS0_MOVE
),
[1] = (KEYCTL_CAPS1_NS_KEYRING_NAME |
- KEYCTL_CAPS1_NS_KEY_TAG),
+ KEYCTL_CAPS1_NS_KEY_TAG |
+ (IS_ENABLED(CONFIG_KEY_NOTIFICATIONS) ? KEYCTL_CAPS1_NOTIFICATIONS : 0)
+ ),
};
static int key_get_type_from_user(char *type,
@@ -429,7 +431,7 @@
/* Root is permitted to invalidate certain special keys */
if (capable(CAP_SYS_ADMIN)) {
- key_ref = lookup_user_key(id, 0, 0);
+ key_ref = lookup_user_key(id, 0, KEY_SYSADMIN_OVERRIDE);
if (IS_ERR(key_ref))
goto error;
if (test_bit(KEY_FLAG_ROOT_CAN_INVAL,
@@ -474,7 +476,8 @@
/* Root is permitted to invalidate certain special keyrings */
if (capable(CAP_SYS_ADMIN)) {
- keyring_ref = lookup_user_key(ringid, 0, 0);
+ keyring_ref = lookup_user_key(ringid, 0,
+ KEY_SYSADMIN_OVERRIDE);
if (IS_ERR(keyring_ref))
goto error;
if (test_bit(KEY_FLAG_ROOT_CAN_CLEAR,
@@ -558,7 +561,7 @@
goto error;
}
- key_ref = lookup_user_key(id, KEY_LOOKUP_FOR_UNLINK, 0);
+ key_ref = lookup_user_key(id, KEY_LOOKUP_PARTIAL, KEY_NEED_UNLINK);
if (IS_ERR(key_ref)) {
ret = PTR_ERR(key_ref);
goto error2;
@@ -658,7 +661,7 @@
key_put(instkey);
key_ref = lookup_user_key(keyid,
KEY_LOOKUP_PARTIAL,
- 0);
+ KEY_AUTHTOKEN_OVERRIDE);
if (!IS_ERR(key_ref))
goto okay;
}
@@ -828,7 +831,7 @@
size_t key_data_len;
/* find the key first */
- key_ref = lookup_user_key(keyid, 0, 0);
+ key_ref = lookup_user_key(keyid, 0, KEY_DEFER_PERM_CHECK);
if (IS_ERR(key_ref)) {
ret = -ENOKEY;
goto out;
@@ -1036,6 +1039,7 @@
if (group != (gid_t) -1)
key->gid = gid;
+ notify_key(key, NOTIFY_KEY_SETATTR, 0);
ret = 0;
error_put:
@@ -1086,6 +1090,7 @@
/* if we're not the sysadmin, we can only change a key that we own */
if (capable(CAP_SYS_ADMIN) || uid_eq(key->uid, current_fsuid())) {
key->perm = perm;
+ notify_key(key, NOTIFY_KEY_SETATTR, 0);
ret = 0;
}
@@ -1461,7 +1466,7 @@
key_put(instkey);
key_ref = lookup_user_key(id,
KEY_LOOKUP_PARTIAL,
- 0);
+ KEY_AUTHTOKEN_OVERRIDE);
if (!IS_ERR(key_ref))
goto okay;
}
@@ -1474,10 +1479,12 @@
okay:
key = key_ref_to_ptr(key_ref);
ret = 0;
- if (test_bit(KEY_FLAG_KEEP, &key->flags))
+ if (test_bit(KEY_FLAG_KEEP, &key->flags)) {
ret = -EPERM;
- else
+ } else {
key_set_timeout(key, timeout);
+ notify_key(key, NOTIFY_KEY_SETATTR, 0);
+ }
key_put(key);
error:
@@ -1567,7 +1574,8 @@
return PTR_ERR(instkey);
key_put(instkey);
- key_ref = lookup_user_key(keyid, KEY_LOOKUP_PARTIAL, 0);
+ key_ref = lookup_user_key(keyid, KEY_LOOKUP_PARTIAL,
+ KEY_AUTHTOKEN_OVERRIDE);
if (IS_ERR(key_ref))
return PTR_ERR(key_ref);
}
@@ -1751,6 +1759,90 @@
return ret;
}
+#ifdef CONFIG_KEY_NOTIFICATIONS
+/*
+ * Watch for changes to a key.
+ *
+ * The caller must have View permission to watch a key or keyring.
+ */
+long keyctl_watch_key(key_serial_t id, int watch_queue_fd, int watch_id)
+{
+ struct watch_queue *wqueue;
+ struct watch_list *wlist = NULL;
+ struct watch *watch = NULL;
+ struct key *key;
+ key_ref_t key_ref;
+ long ret;
+
+ if (watch_id < -1 || watch_id > 0xff)
+ return -EINVAL;
+
+ key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE, KEY_NEED_VIEW);
+ if (IS_ERR(key_ref))
+ return PTR_ERR(key_ref);
+ key = key_ref_to_ptr(key_ref);
+
+ wqueue = get_watch_queue(watch_queue_fd);
+ if (IS_ERR(wqueue)) {
+ ret = PTR_ERR(wqueue);
+ goto err_key;
+ }
+
+ if (watch_id >= 0) {
+ ret = -ENOMEM;
+ if (!key->watchers) {
+ wlist = kzalloc(sizeof(*wlist), GFP_KERNEL);
+ if (!wlist)
+ goto err_wqueue;
+ init_watch_list(wlist, NULL);
+ }
+
+ watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+ if (!watch)
+ goto err_wlist;
+
+ init_watch(watch, wqueue);
+ watch->id = key->serial;
+ watch->info_id = (u32)watch_id << WATCH_INFO_ID__SHIFT;
+
+ ret = security_watch_key(key);
+ if (ret < 0)
+ goto err_watch;
+
+ down_write(&key->sem);
+ if (!key->watchers) {
+ key->watchers = wlist;
+ wlist = NULL;
+ }
+
+ ret = add_watch_to_object(watch, key->watchers);
+ up_write(&key->sem);
+
+ if (ret == 0)
+ watch = NULL;
+ } else {
+ ret = -EBADSLT;
+ if (key->watchers) {
+ down_write(&key->sem);
+ ret = remove_watch_from_object(key->watchers,
+ wqueue, key_serial(key),
+ false);
+ up_write(&key->sem);
+ }
+ }
+
+err_watch:
+ kfree(watch);
+err_wlist:
+ kfree(wlist);
+err_wqueue:
+ put_watch_queue(wqueue);
+err_key:
+ key_put(key);
+ return ret;
+}
+#endif /* CONFIG_KEY_NOTIFICATIONS */
+
/*
* Get keyrings subsystem capabilities.
*/
@@ -1920,6 +2012,9 @@
case KEYCTL_CAPABILITIES:
return keyctl_capabilities((unsigned char __user *)arg2, (size_t)arg3);
+ case KEYCTL_WATCH_KEY:
+ return keyctl_watch_key((key_serial_t)arg2, (int)arg3, (int)arg4);
+
default:
return -EOPNOTSUPP;
}
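
As the else-branch of keyctl_watch_key() above shows, a watch is removed by
calling the same keyctl with a watch_id of -1; -EBADSLT comes back if the key
has no watch list or no watch matches the given queue. A minimal hypothetical
sketch, using the same header fallbacks as the sample program::

	#include <unistd.h>
	#include <linux/unistd.h>
	#include <linux/keyctl.h>

	#ifndef KEYCTL_WATCH_KEY
	#define KEYCTL_WATCH_KEY -1	/* same fallback the sample uses */
	#endif
	#ifndef __NR_keyctl
	#define __NR_keyctl -1
	#endif

	static long watch_key(int key, int pipe_rd_fd, int watch_id)
	{
		return syscall(__NR_keyctl, KEYCTL_WATCH_KEY, key,
			       pipe_rd_fd, watch_id);
	}

	/* Tear the watch down again: same key and queue fd, watch_id of -1. */
	static long unwatch_key(int key, int pipe_rd_fd)
	{
		return watch_key(key, pipe_rd_fd, -1);
	}
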
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index 5ca620d..14abfe7 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -1056,12 +1056,14 @@
down_write(&keyring->sem);
down_write(&keyring_serialise_restrict_sem);
- if (keyring->restrict_link)
+ if (keyring->restrict_link) {
ret = -EEXIST;
- else if (keyring_detect_restriction_cycle(keyring, restrict_link))
+ } else if (keyring_detect_restriction_cycle(keyring, restrict_link)) {
ret = -EDEADLK;
- else
+ } else {
keyring->restrict_link = restrict_link;
+ notify_key(keyring, NOTIFY_KEY_SETATTR, 0);
+ }
up_write(&keyring_serialise_restrict_sem);
up_write(&keyring->sem);
@@ -1362,12 +1364,14 @@
* holds at most one link to any given key of a particular type+description
* combination.
*/
-void __key_link(struct key *key, struct assoc_array_edit **_edit)
+void __key_link(struct key *keyring, struct key *key,
+ struct assoc_array_edit **_edit)
{
__key_get(key);
assoc_array_insert_set_object(*_edit, keyring_key_to_ptr(key));
assoc_array_apply_edit(*_edit);
*_edit = NULL;
+ notify_key(keyring, NOTIFY_KEY_LINKED, key_serial(key));
}
/*
@@ -1451,7 +1455,7 @@
if (ret == 0)
ret = __key_link_check_live_key(keyring, key);
if (ret == 0)
- __key_link(key, &edit);
+ __key_link(keyring, key, &edit);
error_end:
__key_link_end(keyring, &key->index_key, edit);
@@ -1483,7 +1487,7 @@
struct assoc_array_edit *edit;
BUG_ON(*_edit != NULL);
-	
+
edit = assoc_array_delete(&keyring->keys, &keyring_assoc_array_ops,
&key->index_key);
if (IS_ERR(edit))
@@ -1503,6 +1507,7 @@
struct assoc_array_edit **_edit)
{
assoc_array_apply_edit(*_edit);
+ notify_key(keyring, NOTIFY_KEY_UNLINKED, key_serial(key));
*_edit = NULL;
key_payload_reserve(keyring, keyring->datalen - KEYQUOTA_LINK_BYTES);
}
@@ -1621,7 +1626,7 @@
goto error;
__key_unlink(from_keyring, key, &from_edit);
- __key_link(key, &to_edit);
+ __key_link(to_keyring, key, &to_edit);
error:
__key_link_end(to_keyring, &key->index_key, to_edit);
__key_unlink_end(from_keyring, key, from_edit);
@@ -1655,6 +1660,7 @@
} else {
if (edit)
assoc_array_apply_edit(edit);
+ notify_key(keyring, NOTIFY_KEY_CLEARED, 0);
key_payload_reserve(keyring, 0);
ret = 0;
}
diff --git a/security/keys/permission.c b/security/keys/permission.c
index 085f907..4a61f80 100644
--- a/security/keys/permission.c
+++ b/security/keys/permission.c
@@ -13,7 +13,7 @@
* key_task_permission - Check a key can be used
* @key_ref: The key to check.
* @cred: The credentials to use.
- * @perm: The permissions to check for.
+ * @need_perm: The permission required.
*
* Check to see whether permission is granted to use a key in the desired way,
* but permit the security modules to override.
@@ -24,12 +24,30 @@
* permissions bits or the LSM check.
*/
int key_task_permission(const key_ref_t key_ref, const struct cred *cred,
- unsigned perm)
+ enum key_need_perm need_perm)
{
struct key *key;
- key_perm_t kperm;
+ key_perm_t kperm, mask;
int ret;
+ switch (need_perm) {
+ default:
+ WARN_ON(1);
+ return -EACCES;
+ case KEY_NEED_UNLINK:
+ case KEY_SYSADMIN_OVERRIDE:
+ case KEY_AUTHTOKEN_OVERRIDE:
+ case KEY_DEFER_PERM_CHECK:
+ goto lsm;
+
+ case KEY_NEED_VIEW: mask = KEY_OTH_VIEW; break;
+ case KEY_NEED_READ: mask = KEY_OTH_READ; break;
+ case KEY_NEED_WRITE: mask = KEY_OTH_WRITE; break;
+ case KEY_NEED_SEARCH: mask = KEY_OTH_SEARCH; break;
+ case KEY_NEED_LINK: mask = KEY_OTH_LINK; break;
+ case KEY_NEED_SETATTR: mask = KEY_OTH_SETATTR; break;
+ }
+
key = key_ref_to_ptr(key_ref);
/* use the second 8-bits of permissions for keys the caller owns */
@@ -64,13 +82,12 @@
if (is_key_possessed(key_ref))
kperm |= key->perm >> 24;
- kperm = kperm & perm & KEY_NEED_ALL;
-
- if (kperm != perm)
+ if ((kperm & mask) != mask)
return -EACCES;
/* let LSM be the final arbiter */
- return security_key_permission(key_ref, cred, perm);
+lsm:
+ return security_key_permission(key_ref, cred, need_perm);
}
EXPORT_SYMBOL(key_task_permission);
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index 09541de..7e0232d 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -609,7 +609,7 @@
* returned key reference.
*/
key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags,
- key_perm_t perm)
+ enum key_need_perm need_perm)
{
struct keyring_search_context ctx = {
.match_data.cmp = lookup_user_key_possessed,
@@ -773,35 +773,33 @@
/* unlink does not use the nominated key in any way, so can skip all
* the permission checks as it is only concerned with the keyring */
- if (lflags & KEY_LOOKUP_FOR_UNLINK) {
- ret = 0;
- goto error;
- }
-
- if (!(lflags & KEY_LOOKUP_PARTIAL)) {
- ret = wait_for_key_construction(key, true);
- switch (ret) {
- case -ERESTARTSYS:
- goto invalid_key;
- default:
- if (perm)
+ if (need_perm != KEY_NEED_UNLINK) {
+ if (!(lflags & KEY_LOOKUP_PARTIAL)) {
+ ret = wait_for_key_construction(key, true);
+ switch (ret) {
+ case -ERESTARTSYS:
goto invalid_key;
- case 0:
- break;
+ default:
+ if (need_perm != KEY_AUTHTOKEN_OVERRIDE &&
+ need_perm != KEY_DEFER_PERM_CHECK)
+ goto invalid_key;
+ case 0:
+ break;
+ }
+ } else if (need_perm != KEY_DEFER_PERM_CHECK) {
+ ret = key_validate(key);
+ if (ret < 0)
+ goto invalid_key;
}
- } else if (perm) {
- ret = key_validate(key);
- if (ret < 0)
+
+ ret = -EIO;
+ if (!(lflags & KEY_LOOKUP_PARTIAL) &&
+ key_read_state(key) == KEY_IS_UNINSTANTIATED)
goto invalid_key;
}
- ret = -EIO;
- if (!(lflags & KEY_LOOKUP_PARTIAL) &&
- key_read_state(key) == KEY_IS_UNINSTANTIATED)
- goto invalid_key;
-
/* check the permissions */
- ret = key_task_permission(key_ref, ctx.cred, perm);
+ ret = key_task_permission(key_ref, ctx.cred, need_perm);
if (ret < 0)
goto invalid_key;
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index 957b9e3..e1b9f1a 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -418,7 +418,7 @@
goto key_already_present;
if (dest_keyring)
- __key_link(key, &edit);
+ __key_link(dest_keyring, key, &edit);
mutex_unlock(&key_construction_mutex);
if (dest_keyring)
@@ -437,7 +437,7 @@
if (dest_keyring) {
ret = __key_link_check_live_key(dest_keyring, key);
if (ret == 0)
- __key_link(key, &edit);
+ __key_link(dest_keyring, key, &edit);
__key_link_end(dest_keyring, &ctx->index_key, edit);
if (ret < 0)
goto link_check_failed;
diff --git a/security/security.c b/security/security.c
index e0290b7..2a652e5 100644
--- a/security/security.c
+++ b/security/security.c
@@ -2030,6 +2030,22 @@
}
EXPORT_SYMBOL(security_inode_getsecctx);
+#ifdef CONFIG_WATCH_QUEUE
+int security_post_notification(const struct cred *w_cred,
+ const struct cred *cred,
+ struct watch_notification *n)
+{
+ return call_int_hook(post_notification, 0, w_cred, cred, n);
+}
+#endif /* CONFIG_WATCH_QUEUE */
+
+#ifdef CONFIG_KEY_NOTIFICATIONS
+int security_watch_key(struct key *key)
+{
+ return call_int_hook(watch_key, 0, key);
+}
+#endif
+
#ifdef CONFIG_SECURITY_NETWORK
int security_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk)
@@ -2405,10 +2421,10 @@
call_void_hook(key_free, key);
}
-int security_key_permission(key_ref_t key_ref,
- const struct cred *cred, unsigned perm)
+int security_key_permission(key_ref_t key_ref, const struct cred *cred,
+ enum key_need_perm need_perm)
{
- return call_int_hook(key_permission, 0, key_ref, cred, perm);
+ return call_int_hook(key_permission, 0, key_ref, cred, need_perm);
}
int security_key_getsecurity(struct key *key, char **_buffer)
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 7e954b5..efa6108 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -6559,20 +6559,43 @@
static int selinux_key_permission(key_ref_t key_ref,
const struct cred *cred,
- unsigned perm)
+ enum key_need_perm need_perm)
{
struct key *key;
struct key_security_struct *ksec;
- u32 sid;
+ u32 perm, sid;
- /* if no specific permissions are requested, we skip the
- permission check. No serious, additional covert channels
- appear to be created. */
- if (perm == 0)
+ switch (need_perm) {
+ case KEY_NEED_VIEW:
+ perm = KEY__VIEW;
+ break;
+ case KEY_NEED_READ:
+ perm = KEY__READ;
+ break;
+ case KEY_NEED_WRITE:
+ perm = KEY__WRITE;
+ break;
+ case KEY_NEED_SEARCH:
+ perm = KEY__SEARCH;
+ break;
+ case KEY_NEED_LINK:
+ perm = KEY__LINK;
+ break;
+ case KEY_NEED_SETATTR:
+ perm = KEY__SETATTR;
+ break;
+ case KEY_NEED_UNLINK:
+ case KEY_SYSADMIN_OVERRIDE:
+ case KEY_AUTHTOKEN_OVERRIDE:
+ case KEY_DEFER_PERM_CHECK:
return 0;
+ default:
+ WARN_ON(1);
+ return -EPERM;
+ }
sid = cred_sid(cred);
-
key = key_ref_to_ptr(key_ref);
ksec = key->security;
@@ -6594,6 +6617,17 @@
*_buffer = context;
return rc;
}
+
+#ifdef CONFIG_KEY_NOTIFICATIONS
+static int selinux_watch_key(struct key *key)
+{
+ struct key_security_struct *ksec = key->security;
+ u32 sid = current_sid();
+
+ return avc_has_perm(&selinux_state,
+ sid, ksec->sid, SECCLASS_KEY, KEY__VIEW, NULL);
+}
+#endif
#endif
#ifdef CONFIG_SECURITY_INFINIBAND
@@ -7109,6 +7143,9 @@
LSM_HOOK_INIT(key_free, selinux_key_free),
LSM_HOOK_INIT(key_permission, selinux_key_permission),
LSM_HOOK_INIT(key_getsecurity, selinux_key_getsecurity),
+#ifdef CONFIG_KEY_NOTIFICATIONS
+ LSM_HOOK_INIT(watch_key, selinux_watch_key),
+#endif
#endif
#ifdef CONFIG_AUDIT
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index cd44b79..8ffbf95 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -41,6 +41,7 @@
#include <linux/parser.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
+#include <linux/watch_queue.h>
#include "smack.h"
#define TRANS_TRUE "TRUE"
@@ -4213,13 +4214,14 @@
* smack_key_permission - Smack access on a key
* @key_ref: gets to the object
* @cred: the credentials to use
- * @perm: requested key permissions
+ * @need_perm: requested key permission
*
* Return 0 if the task has read and write to the object,
* an error code otherwise
*/
static int smack_key_permission(key_ref_t key_ref,
- const struct cred *cred, unsigned perm)
+ const struct cred *cred,
+ enum key_need_perm need_perm)
{
struct key *keyp;
struct smk_audit_info ad;
@@ -4230,8 +4232,26 @@
/*
* Validate requested permissions
*/
- if (perm & ~KEY_NEED_ALL)
+ switch (need_perm) {
+ case KEY_NEED_READ:
+ case KEY_NEED_SEARCH:
+ case KEY_NEED_VIEW:
+ request |= MAY_READ;
+ break;
+ case KEY_NEED_WRITE:
+ case KEY_NEED_LINK:
+ case KEY_NEED_SETATTR:
+ request |= MAY_WRITE;
+ break;
+ case KEY_NEED_UNSPECIFIED:
+ case KEY_NEED_UNLINK:
+ case KEY_SYSADMIN_OVERRIDE:
+ case KEY_AUTHTOKEN_OVERRIDE:
+ case KEY_DEFER_PERM_CHECK:
+ return 0;
+ default:
return -EINVAL;
+ }
keyp = key_ref_to_ptr(key_ref);
if (keyp == NULL)
@@ -4248,7 +4268,7 @@
if (tkp == NULL)
return -EACCES;
- if (smack_privileged(CAP_MAC_OVERRIDE))
+ if (smack_privileged_cred(CAP_MAC_OVERRIDE, cred))
return 0;
#ifdef CONFIG_AUDIT
@@ -4256,10 +4276,6 @@
ad.a.u.key_struct.key = keyp->serial;
ad.a.u.key_struct.key_desc = keyp->description;
#endif
- if (perm & (KEY_NEED_READ | KEY_NEED_SEARCH | KEY_NEED_VIEW))
- request |= MAY_READ;
- if (perm & (KEY_NEED_WRITE | KEY_NEED_LINK | KEY_NEED_SETATTR))
- request |= MAY_WRITE;
rc = smk_access(tkp, keyp->security, request, &ad);
rc = smk_bu_note("key access", tkp, keyp->security, request, rc);
return rc;
@@ -4294,8 +4310,81 @@
return length;
}
+
+#ifdef CONFIG_KEY_NOTIFICATIONS
+/**
+ * smack_watch_key - Smack access to watch a key for notifications.
+ * @key: The key to be watched
+ *
+ * Return 0 if the @watch->cred has permission to read from the key object and
+ * an error otherwise.
+ */
+static int smack_watch_key(struct key *key)
+{
+ struct smk_audit_info ad;
+ struct smack_known *tkp = smk_of_current();
+ int rc;
+
+ if (key == NULL)
+ return -EINVAL;
+ /*
+ * If the key hasn't been initialized give it access so that
+ * it may do so.
+ */
+ if (key->security == NULL)
+ return 0;
+ /*
+ * This should not occur
+ */
+ if (tkp == NULL)
+ return -EACCES;
+
+ if (smack_privileged_cred(CAP_MAC_OVERRIDE, current_cred()))
+ return 0;
+
+#ifdef CONFIG_AUDIT
+ smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_KEY);
+ ad.a.u.key_struct.key = key->serial;
+ ad.a.u.key_struct.key_desc = key->description;
+#endif
+ rc = smk_access(tkp, key->security, MAY_READ, &ad);
+ rc = smk_bu_note("key watch", tkp, key->security, MAY_READ, rc);
+ return rc;
+}
+#endif /* CONFIG_KEY_NOTIFICATIONS */
#endif /* CONFIG_KEYS */
+#ifdef CONFIG_WATCH_QUEUE
+/**
+ * smack_post_notification - Smack access to post a notification to a queue
+ * @w_cred: The credentials of the watcher.
+ * @cred: The credentials of the event source (may be NULL).
+ * @n: The notification message to be posted.
+ */
+static int smack_post_notification(const struct cred *w_cred,
+ const struct cred *cred,
+ struct watch_notification *n)
+{
+ struct smk_audit_info ad;
+ struct smack_known *subj, *obj;
+ int rc;
+
+ /* Always let maintenance notifications through. */
+ if (n->type == WATCH_TYPE_META)
+ return 0;
+
+ if (!cred)
+ return 0;
+ subj = smk_of_task(smack_cred(cred));
+ obj = smk_of_task(smack_cred(w_cred));
+
+ smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NOTIFICATION);
+ rc = smk_access(subj, obj, MAY_WRITE, &ad);
+ rc = smk_bu_note("notification", subj, obj, MAY_WRITE, rc);
+ return rc;
+}
+#endif /* CONFIG_WATCH_QUEUE */
+
/*
* Smack Audit hooks
*
@@ -4684,8 +4773,15 @@
LSM_HOOK_INIT(key_free, smack_key_free),
LSM_HOOK_INIT(key_permission, smack_key_permission),
LSM_HOOK_INIT(key_getsecurity, smack_key_getsecurity),
+#ifdef CONFIG_KEY_NOTIFICATIONS
+ LSM_HOOK_INIT(watch_key, smack_watch_key),
+#endif
#endif /* CONFIG_KEYS */
+#ifdef CONFIG_WATCH_QUEUE
+ LSM_HOOK_INIT(post_notification, smack_post_notification),
+#endif
+
/* Audit hooks */
#ifdef CONFIG_AUDIT
LSM_HOOK_INIT(audit_rule_init, smack_audit_rule_init),