Merge tag '9p-for-6.9-rc3' of https://github.com/martinetd/linux
Pull minor 9p cleanups from Dominique Martinet:
- kernel doc fix & removal of unused flag
- fix some bogus debug statement for read/write
* tag '9p-for-6.9-rc3' of https://github.com/martinetd/linux:
9p: remove SLAB_MEM_SPREAD flag usage
9p: Fix read/write debug statements to report server reply
9p/trans_fd: remove Excess kernel-doc comment
diff --git a/.get_maintainer.ignore b/.get_maintainer.ignore
index c298bab..7d1b30a 100644
--- a/.get_maintainer.ignore
+++ b/.get_maintainer.ignore
@@ -1,4 +1,5 @@
Alan Cox <alan@lxorguk.ukuu.org.uk>
Alan Cox <root@hraefn.swansea.linux.org.uk>
Christoph Hellwig <hch@lst.de>
+Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Marc Gonzalez <marc.w.gonzalez@free.fr>
diff --git a/.gitignore b/.gitignore
index 689a4fa..c59dc60 100644
--- a/.gitignore
+++ b/.gitignore
@@ -52,6 +52,7 @@
*.xz
*.zst
Module.symvers
+dtbs-list
modules.order
#
diff --git a/.mailmap b/.mailmap
index ee8f03c..59c9a84 100644
--- a/.mailmap
+++ b/.mailmap
@@ -191,10 +191,11 @@
Gao Xiang <xiang@kernel.org> <hsiangkao@aol.com>
Gao Xiang <xiang@kernel.org> <hsiangkao@linux.alibaba.com>
Gao Xiang <xiang@kernel.org> <hsiangkao@redhat.com>
-Geliang Tang <geliang.tang@linux.dev> <geliang.tang@suse.com>
-Geliang Tang <geliang.tang@linux.dev> <geliangtang@xiaomi.com>
-Geliang Tang <geliang.tang@linux.dev> <geliangtang@gmail.com>
-Geliang Tang <geliang.tang@linux.dev> <geliangtang@163.com>
+Geliang Tang <geliang@kernel.org> <geliang.tang@linux.dev>
+Geliang Tang <geliang@kernel.org> <geliang.tang@suse.com>
+Geliang Tang <geliang@kernel.org> <geliangtang@xiaomi.com>
+Geliang Tang <geliang@kernel.org> <geliangtang@gmail.com>
+Geliang Tang <geliang@kernel.org> <geliangtang@163.com>
Georgi Djakov <djakov@kernel.org> <georgi.djakov@linaro.org>
Gerald Schaefer <gerald.schaefer@linux.ibm.com> <geraldsc@de.ibm.com>
Gerald Schaefer <gerald.schaefer@linux.ibm.com> <gerald.schaefer@de.ibm.com>
@@ -324,6 +325,7 @@
Kenneth Westfield <quic_kwestfie@quicinc.com> <kwestfie@codeaurora.org>
Kiran Gunda <quic_kgunda@quicinc.com> <kgunda@codeaurora.org>
Kirill Tkhai <tkhai@ya.ru> <ktkhai@virtuozzo.com>
+Kishon Vijay Abraham I <kishon@kernel.org> <kishon@ti.com>
Konstantin Khlebnikov <koct9i@gmail.com> <khlebnikov@yandex-team.ru>
Konstantin Khlebnikov <koct9i@gmail.com> <k.khlebnikov@samsung.com>
Koushik <raghavendra.koushik@neterion.com>
@@ -338,7 +340,8 @@
Lee Jones <lee@kernel.org> <lee.jones@canonical.com>
Lee Jones <lee@kernel.org> <lee.jones@linaro.org>
Lee Jones <lee@kernel.org> <lee@ubuntu.com>
-Leonard Crestez <leonard.crestez@nxp.com> Leonard Crestez <cdleonard@gmail.com>
+Leonard Crestez <cdleonard@gmail.com> <leonard.crestez@nxp.com>
+Leonard Crestez <cdleonard@gmail.com> <leonard.crestez@intel.com>
Leonardo Bras <leobras.c@gmail.com> <leonardo@linux.ibm.com>
Leonard Göhrs <l.goehrs@pengutronix.de>
Leonid I Ananiev <leonid.i.ananiev@intel.com>
@@ -437,6 +440,8 @@
Muna Sinada <quic_msinada@quicinc.com> <msinada@codeaurora.org>
Murali Nalajala <quic_mnalajal@quicinc.com> <mnalajal@codeaurora.org>
Mythri P K <mythripk@ti.com>
+Nadav Amit <nadav.amit@gmail.com> <namit@vmware.com>
+Nadav Amit <nadav.amit@gmail.com> <namit@cs.technion.ac.il>
Nadia Yvette Chambers <nyc@holomorphy.com> William Lee Irwin III <wli@holomorphy.com>
Naoya Horiguchi <naoya.horiguchi@nec.com> <n-horiguchi@ah.jp.nec.com>
Nathan Chancellor <nathan@kernel.org> <natechancellor@gmail.com>
@@ -493,7 +498,8 @@
Punit Agrawal <punitagrawal@gmail.com> <punit.agrawal@arm.com>
Qais Yousef <qyousef@layalina.io> <qais.yousef@imgtec.com>
Qais Yousef <qyousef@layalina.io> <qais.yousef@arm.com>
-Quentin Monnet <quentin@isovalent.com> <quentin.monnet@netronome.com>
+Quentin Monnet <qmo@kernel.org> <quentin.monnet@netronome.com>
+Quentin Monnet <qmo@kernel.org> <quentin@isovalent.com>
Quentin Perret <qperret@qperret.net> <quentin.perret@arm.com>
Rafael J. Wysocki <rjw@rjwysocki.net> <rjw@sisk.pl>
Rajeev Nandan <quic_rajeevny@quicinc.com> <rajeevny@codeaurora.org>
@@ -552,6 +558,7 @@
Serge Hallyn <sergeh@kernel.org> <serge.hallyn@canonical.com>
Serge Hallyn <sergeh@kernel.org> <serue@us.ibm.com>
Seth Forshee <sforshee@kernel.org> <seth.forshee@canonical.com>
+Shakeel Butt <shakeel.butt@linux.dev> <shakeelb@google.com>
Shannon Nelson <shannon.nelson@amd.com> <snelson@pensando.io>
Shannon Nelson <shannon.nelson@amd.com> <shannon.nelson@intel.com>
Shannon Nelson <shannon.nelson@amd.com> <shannon.nelson@oracle.com>
@@ -570,6 +577,7 @@
Sricharan Ramabadhran <quic_srichara@quicinc.com> <sricharan@codeaurora.org>
Srinivas Ramana <quic_sramana@quicinc.com> <sramana@codeaurora.org>
Sriram R <quic_srirrama@quicinc.com> <srirrama@codeaurora.org>
+Stefan Wahren <wahrenst@gmx.net> <stefan.wahren@i2se.com>
Stéphane Witzmann <stephane.witzmann@ubpmes.univ-bpclermont.fr>
Stephen Hemminger <stephen@networkplumber.org> <shemminger@linux-foundation.org>
Stephen Hemminger <stephen@networkplumber.org> <shemminger@osdl.org>
@@ -607,6 +615,11 @@
TripleX Chung <xxx.phy@gmail.com> <zhongyu@18mail.cn>
Tsuneo Yoshioka <Tsuneo.Yoshioka@f-secure.com>
Tudor Ambarus <tudor.ambarus@linaro.org> <tudor.ambarus@microchip.com>
+Tvrtko Ursulin <tursulin@ursulin.net> <tvrtko.ursulin@intel.com>
+Tvrtko Ursulin <tursulin@ursulin.net> <tvrtko.ursulin@linux.intel.com>
+Tvrtko Ursulin <tursulin@ursulin.net> <tvrtko.ursulin@sophos.com>
+Tvrtko Ursulin <tursulin@ursulin.net> <tvrtko.ursulin@onelan.co.uk>
+Tvrtko Ursulin <tursulin@ursulin.net> <tvrtko@ursulin.net>
Tycho Andersen <tycho@tycho.pizza> <tycho@tycho.ws>
Tzung-Bi Shih <tzungbi@kernel.org> <tzungbi@google.com>
Uwe Kleine-König <ukleinek@informatik.uni-freiburg.de>
diff --git a/CREDITS b/CREDITS
index df8d694..c55c5a0 100644
--- a/CREDITS
+++ b/CREDITS
@@ -63,6 +63,11 @@
S: Buenos Aires
S: Argentina
+NTFS FILESYSTEM
+N: Anton Altaparmakov
+E: anton@tuxera.com
+D: NTFS filesystem
+
N: Tim Alpaerts
E: tim_alpaerts@toyota-motor-europe.com
D: 802.2 class II logical link control layer,
@@ -2955,6 +2960,11 @@
S: Reston, Virginia 20191
S: USA
+N: Sekhar Nori
+E: nori.sekhar@gmail.com
+D: Maintainer of Texas Instruments DaVinci machine support, contributor
+D: to device drivers relevant to that SoC family.
+
N: Fredrik Noring
E: noring@nocrew.org
W: http://www.lysator.liu.se/~noring/
diff --git a/Documentation/ABI/obsolete/sysfs-gpio b/Documentation/ABI/obsolete/sysfs-gpio
index b8b0fd3..da1345d 100644
--- a/Documentation/ABI/obsolete/sysfs-gpio
+++ b/Documentation/ABI/obsolete/sysfs-gpio
@@ -28,5 +28,5 @@
/label ... (r/o) descriptive, not necessarily unique
/ngpio ... (r/o) number of GPIOs; numbered N to N + (ngpio - 1)
- This ABI is deprecated and will be removed after 2020. It is
- replaced with the GPIO character device.
+ This ABI is obsoleted by Documentation/ABI/testing/gpio-cdev and will be
+ removed after 2020.
diff --git a/Documentation/ABI/testing/configfs-usb-gadget-ffs b/Documentation/ABI/testing/configfs-usb-gadget-ffs
index e39b276..bf8936f 100644
--- a/Documentation/ABI/testing/configfs-usb-gadget-ffs
+++ b/Documentation/ABI/testing/configfs-usb-gadget-ffs
@@ -4,6 +4,14 @@
Description: The purpose of this directory is to create and remove it.
A corresponding USB function instance is created/removed.
- There are no attributes here.
- All parameters are set through FunctionFS.
+ All attributes are read only:
+
+ ============= ============================================
+ ready 1 if the function is ready to be used, E.G.
+ if userspace has written descriptors and
+ strings to ep0, so the gadget can be
+ enabled - 0 otherwise.
+ ============= ============================================
+
+ All other parameters are set through FunctionFS.
diff --git a/Documentation/ABI/testing/debugfs-cxl b/Documentation/ABI/testing/debugfs-cxl
index fe61d37..c61f9b8 100644
--- a/Documentation/ABI/testing/debugfs-cxl
+++ b/Documentation/ABI/testing/debugfs-cxl
@@ -33,3 +33,37 @@
device cannot clear poison from the address, -ENXIO is returned.
The clear_poison attribute is only visible for devices
supporting the capability.
+
+What: /sys/kernel/debug/cxl/einj_types
+Date: January, 2024
+KernelVersion: v6.9
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (RO) Prints the CXL protocol error types made available by
+ the platform in the format:
+
+ 0x<error number> <error type>
+
+ The possible error types are (as of ACPI v6.5):
+
+ 0x1000 CXL.cache Protocol Correctable
+ 0x2000 CXL.cache Protocol Uncorrectable non-fatal
+ 0x4000 CXL.cache Protocol Uncorrectable fatal
+ 0x8000 CXL.mem Protocol Correctable
+ 0x10000 CXL.mem Protocol Uncorrectable non-fatal
+ 0x20000 CXL.mem Protocol Uncorrectable fatal
+
+ The <error number> can be written to einj_inject to inject
+ <error type> into a chosen dport.
+
+What: /sys/kernel/debug/cxl/$dport_dev/einj_inject
+Date: January, 2024
+KernelVersion: v6.9
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (WO) Writing an integer to this file injects the corresponding
+ CXL protocol error into $dport_dev ($dport_dev will be a device
+ name from /sys/bus/pci/devices). The integer to type mapping for
+ injection can be found by reading from einj_types. If the dport
+ was enumerated in RCH mode, a CXL 1.1 error is injected, otherwise
+ a CXL 2.0 error is injected.
diff --git a/Documentation/ABI/testing/debugfs-driver-qat b/Documentation/ABI/testing/debugfs-driver-qat
index b2db010..bd67937 100644
--- a/Documentation/ABI/testing/debugfs-driver-qat
+++ b/Documentation/ABI/testing/debugfs-driver-qat
@@ -81,3 +81,29 @@
<N>: Number of Compress and Verify (CnV) errors and type
of the last CnV error detected by Acceleration
Engine N.
+
+What: /sys/kernel/debug/qat_<device>_<BDF>/heartbeat/inject_error
+Date: March 2024
+KernelVersion: 6.8
+Contact: qat-linux@intel.com
+Description: (WO) Write to inject an error that simulates an heartbeat
+ failure. This is to be used for testing purposes.
+
+ After writing this file, the driver stops arbitration on a
+ random engine and disables the fetching of heartbeat counters.
+ If a workload is running on the device, a job submitted to the
+ accelerator might not get a response and a read of the
+ `heartbeat/status` attribute might report -1, i.e. device
+ unresponsive.
+ The error is unrecoverable thus the device must be restarted to
+ restore its functionality.
+
+ This attribute is available only when the kernel is built with
+ CONFIG_CRYPTO_DEV_QAT_ERROR_INJECTION=y.
+
+ A write of 1 enables error injection.
+
+ The following example shows how to enable error injection::
+
+ # cd /sys/kernel/debug/qat_<device>_<BDF>
+ # echo 1 > heartbeat/inject_error
diff --git a/Documentation/ABI/testing/debugfs-hisi-hpre b/Documentation/ABI/testing/debugfs-hisi-hpre
index 8e8de49..d4e16ef 100644
--- a/Documentation/ABI/testing/debugfs-hisi-hpre
+++ b/Documentation/ABI/testing/debugfs-hisi-hpre
@@ -111,6 +111,28 @@
node is used to show the change of the qm register values. This
node can be help users to check the change of register values.
+What: /sys/kernel/debug/hisi_hpre/<bdf>/qm/qm_state
+Date: Jan 2024
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the state of the device.
+ 0: busy, 1: idle.
+ Only available for PF, and take no other effect on HPRE.
+
+What: /sys/kernel/debug/hisi_hpre/<bdf>/qm/dev_timeout
+Date: Feb 2024
+Contact: linux-crypto@vger.kernel.org
+Description: Set the wait time when stop queue fails. Available for both PF
+ and VF, and take no other effect on HPRE.
+ 0: not wait(default), others value: wait dev_timeout * 20 microsecond.
+
+What: /sys/kernel/debug/hisi_hpre/<bdf>/qm/dev_state
+Date: Feb 2024
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the stop queue status of the QM. The default value is 0,
+ if dev_timeout is set, when stop queue fails, the dev_state
+ will return non-zero value. Available for both PF and VF,
+ and take no other effect on HPRE.
+
What: /sys/kernel/debug/hisi_hpre/<bdf>/hpre_dfx/diff_regs
Date: Mar 2022
Contact: linux-crypto@vger.kernel.org
diff --git a/Documentation/ABI/testing/debugfs-hisi-sec b/Documentation/ABI/testing/debugfs-hisi-sec
index deeefe2..6c6c9a6 100644
--- a/Documentation/ABI/testing/debugfs-hisi-sec
+++ b/Documentation/ABI/testing/debugfs-hisi-sec
@@ -91,6 +91,28 @@
node is used to show the change of the qm register values. This
node can be help users to check the change of register values.
+What: /sys/kernel/debug/hisi_sec2/<bdf>/qm/qm_state
+Date: Jan 2024
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the state of the device.
+ 0: busy, 1: idle.
+ Only available for PF, and take no other effect on SEC.
+
+What: /sys/kernel/debug/hisi_sec2/<bdf>/qm/dev_timeout
+Date: Feb 2024
+Contact: linux-crypto@vger.kernel.org
+Description: Set the wait time when stop queue fails. Available for both PF
+ and VF, and take no other effect on SEC.
+ 0: not wait(default), others value: wait dev_timeout * 20 microsecond.
+
+What: /sys/kernel/debug/hisi_sec2/<bdf>/qm/dev_state
+Date: Feb 2024
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the stop queue status of the QM. The default value is 0,
+ if dev_timeout is set, when stop queue fails, the dev_state
+ will return non-zero value. Available for both PF and VF,
+ and take no other effect on SEC.
+
What: /sys/kernel/debug/hisi_sec2/<bdf>/sec_dfx/diff_regs
Date: Mar 2022
Contact: linux-crypto@vger.kernel.org
diff --git a/Documentation/ABI/testing/debugfs-hisi-zip b/Documentation/ABI/testing/debugfs-hisi-zip
index 593714a..a22dd69 100644
--- a/Documentation/ABI/testing/debugfs-hisi-zip
+++ b/Documentation/ABI/testing/debugfs-hisi-zip
@@ -104,6 +104,28 @@
node is used to show the change of the qm registers value. This
node can be help users to check the change of register values.
+What: /sys/kernel/debug/hisi_zip/<bdf>/qm/qm_state
+Date: Jan 2024
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the state of the device.
+ 0: busy, 1: idle.
+ Only available for PF, and take no other effect on ZIP.
+
+What: /sys/kernel/debug/hisi_zip/<bdf>/qm/dev_timeout
+Date: Feb 2024
+Contact: linux-crypto@vger.kernel.org
+Description: Set the wait time when stop queue fails. Available for both PF
+ and VF, and take no other effect on ZIP.
+ 0: not wait(default), others value: wait dev_timeout * 20 microsecond.
+
+What: /sys/kernel/debug/hisi_zip/<bdf>/qm/dev_state
+Date: Feb 2024
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the stop queue status of the QM. The default value is 0,
+ if dev_timeout is set, when stop queue fails, the dev_state
+ will return non-zero value. Available for both PF and VF,
+ and take no other effect on ZIP.
+
What: /sys/kernel/debug/hisi_zip/<bdf>/zip_dfx/diff_regs
Date: Mar 2022
Contact: linux-crypto@vger.kernel.org
diff --git a/Documentation/ABI/testing/debugfs-intel-iommu b/Documentation/ABI/testing/debugfs-intel-iommu
new file mode 100644
index 0000000..2ab8464
--- /dev/null
+++ b/Documentation/ABI/testing/debugfs-intel-iommu
@@ -0,0 +1,276 @@
+What: /sys/kernel/debug/iommu/intel/iommu_regset
+Date: December 2023
+Contact: Jingqi Liu <Jingqi.liu@intel.com>
+Description:
+ This file dumps all the register contents for each IOMMU device.
+
+ Example in Kabylake:
+
+ ::
+
+ $ sudo cat /sys/kernel/debug/iommu/intel/iommu_regset
+
+ IOMMU: dmar0 Register Base Address: 26be37000
+
+ Name Offset Contents
+ VER 0x00 0x0000000000000010
+ GCMD 0x18 0x0000000000000000
+ GSTS 0x1c 0x00000000c7000000
+ FSTS 0x34 0x0000000000000000
+ FECTL 0x38 0x0000000000000000
+
+ [...]
+
+ IOMMU: dmar1 Register Base Address: fed90000
+
+ Name Offset Contents
+ VER 0x00 0x0000000000000010
+ GCMD 0x18 0x0000000000000000
+ GSTS 0x1c 0x00000000c7000000
+ FSTS 0x34 0x0000000000000000
+ FECTL 0x38 0x0000000000000000
+
+ [...]
+
+ IOMMU: dmar2 Register Base Address: fed91000
+
+ Name Offset Contents
+ VER 0x00 0x0000000000000010
+ GCMD 0x18 0x0000000000000000
+ GSTS 0x1c 0x00000000c7000000
+ FSTS 0x34 0x0000000000000000
+ FECTL 0x38 0x0000000000000000
+
+ [...]
+
+What: /sys/kernel/debug/iommu/intel/ir_translation_struct
+Date: December 2023
+Contact: Jingqi Liu <Jingqi.liu@intel.com>
+Description:
+ This file dumps the table entries for Interrupt
+ remapping and Interrupt posting.
+
+ Example in Kabylake:
+
+ ::
+
+ $ sudo cat /sys/kernel/debug/iommu/intel/ir_translation_struct
+
+ Remapped Interrupt supported on IOMMU: dmar0
+ IR table address:100900000
+
+ Entry SrcID DstID Vct IRTE_high IRTE_low
+ 0 00:0a.0 00000080 24 0000000000040050 000000800024000d
+ 1 00:0a.0 00000001 ef 0000000000040050 0000000100ef000d
+
+ Remapped Interrupt supported on IOMMU: dmar1
+ IR table address:100300000
+ Entry SrcID DstID Vct IRTE_high IRTE_low
+ 0 00:02.0 00000002 26 0000000000040010 000000020026000d
+
+ [...]
+
+ ****
+
+ Posted Interrupt supported on IOMMU: dmar0
+ IR table address:100900000
+ Entry SrcID PDA_high PDA_low Vct IRTE_high IRTE_low
+
+What: /sys/kernel/debug/iommu/intel/dmar_translation_struct
+Date: December 2023
+Contact: Jingqi Liu <Jingqi.liu@intel.com>
+Description:
+ This file dumps Intel IOMMU DMA remapping tables, such
+ as root table, context table, PASID directory and PASID
+ table entries in debugfs. For legacy mode, it doesn't
+ support PASID, and hence PASID field is defaulted to
+ '-1' and other PASID related fields are invalid.
+
+ Example in Kabylake:
+
+ ::
+
+ $ sudo cat /sys/kernel/debug/iommu/intel/dmar_translation_struct
+
+ IOMMU dmar1: Root Table Address: 0x103027000
+ B.D.F Root_entry
+ 00:02.0 0x0000000000000000:0x000000010303e001
+
+ Context_entry
+ 0x0000000000000102:0x000000010303f005
+
+ PASID PASID_table_entry
+ -1 0x0000000000000000:0x0000000000000000:0x0000000000000000
+
+ IOMMU dmar0: Root Table Address: 0x103028000
+ B.D.F Root_entry
+ 00:0a.0 0x0000000000000000:0x00000001038a7001
+
+ Context_entry
+ 0x0000000000000000:0x0000000103220e7d
+
+ PASID PASID_table_entry
+ 0 0x0000000000000000:0x0000000000800002:0x00000001038a5089
+
+ [...]
+
+What: /sys/kernel/debug/iommu/intel/invalidation_queue
+Date: December 2023
+Contact: Jingqi Liu <Jingqi.liu@intel.com>
+Description:
+ This file exports invalidation queue internals of each
+ IOMMU device.
+
+ Example in Kabylake:
+
+ ::
+
+ $ sudo cat /sys/kernel/debug/iommu/intel/invalidation_queue
+
+ Invalidation queue on IOMMU: dmar0
+ Base: 0x10022e000 Head: 20 Tail: 20
+ Index qw0 qw1 qw2
+ 0 0000000000000014 0000000000000000 0000000000000000
+ 1 0000000200000025 0000000100059c04 0000000000000000
+ 2 0000000000000014 0000000000000000 0000000000000000
+
+ qw3 status
+ 0000000000000000 0000000000000000
+ 0000000000000000 0000000000000000
+ 0000000000000000 0000000000000000
+
+ [...]
+
+ Invalidation queue on IOMMU: dmar1
+ Base: 0x10026e000 Head: 32 Tail: 32
+ Index qw0 qw1 status
+ 0 0000000000000004 0000000000000000 0000000000000000
+ 1 0000000200000025 0000000100059804 0000000000000000
+ 2 0000000000000011 0000000000000000 0000000000000000
+
+ [...]
+
+What: /sys/kernel/debug/iommu/intel/dmar_perf_latency
+Date: December 2023
+Contact: Jingqi Liu <Jingqi.liu@intel.com>
+Description:
+ This file is used to control and show counts of
+ execution time ranges for various types per DMAR.
+
+ Firstly, write a value to
+ /sys/kernel/debug/iommu/intel/dmar_perf_latency
+ to enable sampling.
+
+ The possible values are as follows:
+
+ * 0 - disable sampling all latency data
+
+ * 1 - enable sampling IOTLB invalidation latency data
+
+ * 2 - enable sampling devTLB invalidation latency data
+
+ * 3 - enable sampling intr entry cache invalidation latency data
+
+ Next, read /sys/kernel/debug/iommu/intel/dmar_perf_latency gives
+ a snapshot of sampling result of all enabled monitors.
+
+ Examples in Kabylake:
+
+ ::
+
+ 1) Disable sampling all latency data:
+
+ $ sudo echo 0 > /sys/kernel/debug/iommu/intel/dmar_perf_latency
+
+ 2) Enable sampling IOTLB invalidation latency data
+
+ $ sudo echo 1 > /sys/kernel/debug/iommu/intel/dmar_perf_latency
+
+ $ sudo cat /sys/kernel/debug/iommu/intel/dmar_perf_latency
+
+ IOMMU: dmar0 Register Base Address: 26be37000
+ <0.1us 0.1us-1us 1us-10us 10us-100us 100us-1ms
+ inv_iotlb 0 0 0 0 0
+
+ 1ms-10ms >=10ms min(us) max(us) average(us)
+ inv_iotlb 0 0 0 0 0
+
+ [...]
+
+ IOMMU: dmar2 Register Base Address: fed91000
+ <0.1us 0.1us-1us 1us-10us 10us-100us 100us-1ms
+ inv_iotlb 0 0 18 0 0
+
+ 1ms-10ms >=10ms min(us) max(us) average(us)
+ inv_iotlb 0 0 2 2 2
+
+ 3) Enable sampling devTLB invalidation latency data
+
+ $ sudo echo 2 > /sys/kernel/debug/iommu/intel/dmar_perf_latency
+
+ $ sudo cat /sys/kernel/debug/iommu/intel/dmar_perf_latency
+
+ IOMMU: dmar0 Register Base Address: 26be37000
+ <0.1us 0.1us-1us 1us-10us 10us-100us 100us-1ms
+ inv_devtlb 0 0 0 0 0
+
+ >=10ms min(us) max(us) average(us)
+ inv_devtlb 0 0 0 0
+
+ [...]
+
+What: /sys/kernel/debug/iommu/intel/<bdf>/domain_translation_struct
+Date: December 2023
+Contact: Jingqi Liu <Jingqi.liu@intel.com>
+Description:
+ This file dumps a specified page table of Intel IOMMU
+ in legacy mode or scalable mode.
+
+ For a device that only supports legacy mode, dump its
+ page table by the debugfs file in the debugfs device
+ directory. e.g.
+ /sys/kernel/debug/iommu/intel/0000:00:02.0/domain_translation_struct.
+
+ For a device that supports scalable mode, dump the
+ page table of specified pasid by the debugfs file in
+ the debugfs pasid directory. e.g.
+ /sys/kernel/debug/iommu/intel/0000:00:02.0/1/domain_translation_struct.
+
+ Examples in Kabylake:
+
+ ::
+
+ 1) Dump the page table of device "0000:00:02.0" that only supports legacy mode.
+
+ $ sudo cat /sys/kernel/debug/iommu/intel/0000:00:02.0/domain_translation_struct
+
+ Device 0000:00:02.0 @0x1017f8000
+ IOVA_PFN PML5E PML4E
+ 0x000000008d800 | 0x0000000000000000 0x00000001017f9003
+ 0x000000008d801 | 0x0000000000000000 0x00000001017f9003
+ 0x000000008d802 | 0x0000000000000000 0x00000001017f9003
+
+ PDPE PDE PTE
+ 0x00000001017fa003 0x00000001017fb003 0x000000008d800003
+ 0x00000001017fa003 0x00000001017fb003 0x000000008d801003
+ 0x00000001017fa003 0x00000001017fb003 0x000000008d802003
+
+ [...]
+
+ 2) Dump the page table of device "0000:00:0a.0" with PASID "1" that
+ supports scalable mode.
+
+ $ sudo cat /sys/kernel/debug/iommu/intel/0000:00:0a.0/1/domain_translation_struct
+
+ Device 0000:00:0a.0 with pasid 1 @0x10c112000
+ IOVA_PFN PML5E PML4E
+ 0x0000000000000 | 0x0000000000000000 0x000000010df93003
+ 0x0000000000001 | 0x0000000000000000 0x000000010df93003
+ 0x0000000000002 | 0x0000000000000000 0x000000010df93003
+
+ PDPE PDE PTE
+ 0x0000000106ae6003 0x0000000104b38003 0x0000000147c00803
+ 0x0000000106ae6003 0x0000000104b38003 0x0000000147c01803
+ 0x0000000106ae6003 0x0000000104b38003 0x0000000147c02803
+
+ [...]
diff --git a/Documentation/ABI/testing/gpio-cdev b/Documentation/ABI/testing/gpio-cdev
index 66bdcd1..c9689b2 100644
--- a/Documentation/ABI/testing/gpio-cdev
+++ b/Documentation/ABI/testing/gpio-cdev
@@ -6,8 +6,9 @@
The character device files /dev/gpiochip* are the interface
between GPIO chips and userspace.
- The ioctl(2)-based ABI is defined and documented in
- [include/uapi]<linux/gpio.h>.
+ The ioctl(2)-based ABI is defined in
+ [include/uapi]<linux/gpio.h> and documented in
+ Documentation/userspace-api/gpio/chardev.rst.
The following file operations are supported:
@@ -17,8 +18,8 @@
ioctl(2)
Initiate various actions.
- See the inline documentation in [include/uapi]<linux/gpio.h>
- for descriptions of all ioctls.
+ See Documentation/userspace-api/gpio/chardev.rst
+ for a description of all ioctls.
close(2)
Stops and free up the I/O contexts that was associated
diff --git a/Documentation/ABI/testing/sysfs-bus-coresight-devices-tpdm b/Documentation/ABI/testing/sysfs-bus-coresight-devices-tpdm
index 4dd49b1..b4d0fc8 100644
--- a/Documentation/ABI/testing/sysfs-bus-coresight-devices-tpdm
+++ b/Documentation/ABI/testing/sysfs-bus-coresight-devices-tpdm
@@ -170,3 +170,90 @@
Description:
(RW) Set/Get the MSR(mux select register) for the DSB subunit
TPDM.
+
+What: /sys/bus/coresight/devices/<tpdm-name>/cmb_mode
+Date: January 2024
+KernelVersion 6.9
+Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com>
+Description: (Write) Set the data collection mode of CMB tpdm. Continuous
+ change creates CMB data set elements on every CMBCLK edge.
+ Trace-on-change creates CMB data set elements only when a new
+ data set element differs in value from the previous element
+ in a CMB data set.
+
+ Accepts only one of the 2 values - 0 or 1.
+ 0 : Continuous CMB collection mode.
+ 1 : Trace-on-change CMB collection mode.
+
+What: /sys/bus/coresight/devices/<tpdm-name>/cmb_trig_patt/xpr[0:1]
+Date: January 2024
+KernelVersion 6.9
+Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com>
+Description:
+ (RW) Set/Get the value of the trigger pattern for the CMB
+ subunit TPDM.
+
+What: /sys/bus/coresight/devices/<tpdm-name>/cmb_trig_patt/xpmr[0:1]
+Date: January 2024
+KernelVersion 6.9
+Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com>
+Description:
+ (RW) Set/Get the mask of the trigger pattern for the CMB
+ subunit TPDM.
+
+What: /sys/bus/coresight/devices/<tpdm-name>/dsb_patt/tpr[0:1]
+Date: January 2024
+KernelVersion 6.9
+Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com>
+Description:
+ (RW) Set/Get the value of the pattern for the CMB subunit TPDM.
+
+What: /sys/bus/coresight/devices/<tpdm-name>/dsb_patt/tpmr[0:1]
+Date: January 2024
+KernelVersion 6.9
+Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com>
+Description:
+ (RW) Set/Get the mask of the pattern for the CMB subunit TPDM.
+
+What: /sys/bus/coresight/devices/<tpdm-name>/cmb_patt/enable_ts
+Date: January 2024
+KernelVersion 6.9
+Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com>
+Description:
+ (Write) Set the pattern timestamp of CMB tpdm. Read
+ the pattern timestamp of CMB tpdm.
+
+ Accepts only one of the 2 values - 0 or 1.
+ 0 : Disable CMB pattern timestamp.
+ 1 : Enable CMB pattern timestamp.
+
+What: /sys/bus/coresight/devices/<tpdm-name>/cmb_trig_ts
+Date: January 2024
+KernelVersion 6.9
+Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com>
+Description:
+ (RW) Set/Get the trigger timestamp of the CMB for tpdm.
+
+ Accepts only one of the 2 values - 0 or 1.
+ 0 : Set the CMB trigger type to false
+ 1 : Set the CMB trigger type to true
+
+What: /sys/bus/coresight/devices/<tpdm-name>/cmb_ts_all
+Date: January 2024
+KernelVersion 6.9
+Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com>
+Description:
+ (RW) Read or write the status of timestamp upon all interface.
+ Only value 0 and 1 can be written to this node. Set this node to 1 to requeset
+ timestamp to all trace packet.
+ Accepts only one of the 2 values - 0 or 1.
+ 0 : Disable the timestamp of all trace packets.
+ 1 : Enable the timestamp of all trace packets.
+
+What: /sys/bus/coresight/devices/<tpdm-name>/cmb_msr/msr[0:31]
+Date: January 2024
+KernelVersion 6.9
+Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com>
+Description:
+ (RW) Set/Get the MSR(mux select register) for the CMB subunit
+ TPDM.
diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl
index fff2581..3f5627a 100644
--- a/Documentation/ABI/testing/sysfs-bus-cxl
+++ b/Documentation/ABI/testing/sysfs-bus-cxl
@@ -552,3 +552,37 @@
attribute is only visible for devices supporting the
capability. The retrieved errors are logged as kernel
events when cxl_poison event tracing is enabled.
+
+
+What: /sys/bus/cxl/devices/regionZ/accessY/read_bandwidth
+ /sys/bus/cxl/devices/regionZ/accessY/write_banwidth
+Date: Jan, 2024
+KernelVersion: v6.9
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (RO) The aggregated read or write bandwidth of the region. The
+ number is the accumulated read or write bandwidth of all CXL memory
+ devices that contributes to the region in MB/s. It is
+ identical data that should appear in
+ /sys/devices/system/node/nodeX/accessY/initiators/read_bandwidth or
+ /sys/devices/system/node/nodeX/accessY/initiators/write_bandwidth.
+ See Documentation/ABI/stable/sysfs-devices-node. access0 provides
+ the number to the closest initiator and access1 provides the
+ number to the closest CPU.
+
+
+What: /sys/bus/cxl/devices/regionZ/accessY/read_latency
+ /sys/bus/cxl/devices/regionZ/accessY/write_latency
+Date: Jan, 2024
+KernelVersion: v6.9
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (RO) The read or write latency of the region. The number is
+ the worst read or write latency of all CXL memory devices that
+ contributes to the region in nanoseconds. It is identical data
+ that should appear in
+ /sys/devices/system/node/nodeX/accessY/initiators/read_latency or
+ /sys/devices/system/node/nodeX/accessY/initiators/write_latency.
+ See Documentation/ABI/stable/sysfs-devices-node. access0 provides
+ the number to the closest initiator and access1 provides the
+ number to the closest CPU.
diff --git a/Documentation/ABI/testing/sysfs-bus-dax b/Documentation/ABI/testing/sysfs-bus-dax
new file mode 100644
index 0000000..b34266b
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-dax
@@ -0,0 +1,153 @@
+What: /sys/bus/dax/devices/daxX.Y/align
+Date: October, 2020
+KernelVersion: v5.10
+Contact: nvdimm@lists.linux.dev
+Description:
+ (RW) Provides a way to specify an alignment for a dax device.
+ Values allowed are constrained by the physical address ranges
+ that back the dax device, and also by arch requirements.
+
+What: /sys/bus/dax/devices/daxX.Y/mapping
+Date: October, 2020
+KernelVersion: v5.10
+Contact: nvdimm@lists.linux.dev
+Description:
+ (WO) Provides a way to allocate a mapping range under a dax
+ device. Specified in the format <start>-<end>.
+
+What: /sys/bus/dax/devices/daxX.Y/mapping[0..N]/start
+What: /sys/bus/dax/devices/daxX.Y/mapping[0..N]/end
+What: /sys/bus/dax/devices/daxX.Y/mapping[0..N]/page_offset
+Date: October, 2020
+KernelVersion: v5.10
+Contact: nvdimm@lists.linux.dev
+Description:
+ (RO) A dax device may have multiple constituent discontiguous
+ address ranges. These are represented by the different
+ 'mappingX' subdirectories. The 'start' attribute indicates the
+ start physical address for the given range. The 'end' attribute
+ indicates the end physical address for the given range. The
+ 'page_offset' attribute indicates the offset of the current
+ range in the dax device.
+
+What: /sys/bus/dax/devices/daxX.Y/resource
+Date: June, 2019
+KernelVersion: v5.3
+Contact: nvdimm@lists.linux.dev
+Description:
+ (RO) The resource attribute indicates the starting physical
+ address of a dax device. In case of a device with multiple
+ constituent ranges, it indicates the starting address of the
+ first range.
+
+What: /sys/bus/dax/devices/daxX.Y/size
+Date: October, 2020
+KernelVersion: v5.10
+Contact: nvdimm@lists.linux.dev
+Description:
+ (RW) The size attribute indicates the total size of a dax
+ device. For creating subdivided dax devices, or for resizing
+ an existing device, the new size can be written to this as
+ part of the reconfiguration process.
+
+What: /sys/bus/dax/devices/daxX.Y/numa_node
+Date: November, 2019
+KernelVersion: v5.5
+Contact: nvdimm@lists.linux.dev
+Description:
+ (RO) If NUMA is enabled and the platform has affinitized the
+ backing device for this dax device, emit the CPU node
+ affinity for this device.
+
+What: /sys/bus/dax/devices/daxX.Y/target_node
+Date: February, 2019
+KernelVersion: v5.1
+Contact: nvdimm@lists.linux.dev
+Description:
+ (RO) The target-node attribute is the Linux numa-node that a
+ device-dax instance may create when it is online. Prior to
+ being online the device's 'numa_node' property reflects the
+ closest online cpu node which is the typical expectation of a
+ device 'numa_node'. Once it is online it becomes its own
+ distinct numa node.
+
+What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/available_size
+Date: October, 2020
+KernelVersion: v5.10
+Contact: nvdimm@lists.linux.dev
+Description:
+ (RO) The available_size attribute tracks available dax region
+ capacity. This only applies to volatile hmem devices, not pmem
+ devices, since pmem devices are defined by nvdimm namespace
+ boundaries.
+
+What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/size
+Date: July, 2017
+KernelVersion: v5.1
+Contact: nvdimm@lists.linux.dev
+Description:
+ (RO) The size attribute indicates the size of a given dax region
+ in bytes.
+
+What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/align
+Date: October, 2020
+KernelVersion: v5.10
+Contact: nvdimm@lists.linux.dev
+Description:
+ (RO) The align attribute indicates alignment of the dax region.
+ Changes on align may not always be valid, when say certain
+ mappings were created with 2M and then we switch to 1G. This
+ validates all ranges against the new value being attempted, post
+ resizing.
+
+What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/seed
+Date: October, 2020
+KernelVersion: v5.10
+Contact: nvdimm@lists.linux.dev
+Description:
+ (RO) The seed device is a concept for dynamic dax regions to be
+ able to split the region amongst multiple sub-instances. The
+ seed device, similar to libnvdimm seed devices, is a device
+ that starts with zero capacity allocated and unbound to a
+ driver.
+
+What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/create
+Date: October, 2020
+KernelVersion: v5.10
+Contact: nvdimm@lists.linux.dev
+Description:
+ (RW) The create interface to the dax region provides a way to
+ create a new unconfigured dax device under the given region, which
+ can then be configured (with a size etc.) and then probed.
+
+What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/delete
+Date: October, 2020
+KernelVersion: v5.10
+Contact: nvdimm@lists.linux.dev
+Description:
+ (WO) The delete interface for a dax region provides for deletion
+ of any 0-sized and idle dax devices.
+
+What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/id
+Date: July, 2017
+KernelVersion: v5.1
+Contact: nvdimm@lists.linux.dev
+Description:
+ (RO) The id attribute indicates the region id of a dax region.
+
+What: /sys/bus/dax/devices/daxX.Y/memmap_on_memory
+Date: January, 2024
+KernelVersion: v6.8
+Contact: nvdimm@lists.linux.dev
+Description:
+ (RW) Control the memmap_on_memory setting if the dax device
+ were to be hotplugged as system memory. This determines whether
+ the 'altmap' for the hotplugged memory will be placed on the
+ device being hotplugged (memmap_on_memory=1) or if it will be
+ placed on regular memory (memmap_on_memory=0). This attribute
+ must be set before the device is handed over to the 'kmem'
+ driver (i.e. hotplugged into system-ram). Additionally, this
+ depends on CONFIG_MHP_MEMMAP_ON_MEMORY, and a globally enabled
+ memmap_on_memory parameter for memory_hotplug. This is
+ typically set on the kernel command line -
+ memory_hotplug.memmap_on_memory set to 'true' or 'force'."
diff --git a/Documentation/ABI/testing/sysfs-bus-iio-adc-pac1934 b/Documentation/ABI/testing/sysfs-bus-iio-adc-pac1934
new file mode 100644
index 0000000..625b7f8
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-iio-adc-pac1934
@@ -0,0 +1,9 @@
+What: /sys/bus/iio/devices/iio:deviceX/in_shunt_resistorY
+KernelVersion: 6.7
+Contact: linux-iio@vger.kernel.org
+Description:
+ The value of the shunt resistor may be known only at runtime
+ and set by a client application. This attribute allows to
+ set its value in micro-ohms. X is the IIO index of the device.
+ Y is the channel number. The value is used to calculate
+ current, power and accumulated energy.
diff --git a/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats b/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
index 860db53..d1f67bb 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
+++ b/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
@@ -11,7 +11,7 @@
What: /sys/bus/pci/devices/<dev>/aer_dev_correctable
Date: July 2018
-KernelVersion: 4.19.0
+KernelVersion: 4.19.0
Contact: linux-pci@vger.kernel.org, rajatja@google.com
Description: List of correctable errors seen and reported by this
PCI device using ERR_COR. Note that since multiple errors may
@@ -32,7 +32,7 @@
What: /sys/bus/pci/devices/<dev>/aer_dev_fatal
Date: July 2018
-KernelVersion: 4.19.0
+KernelVersion: 4.19.0
Contact: linux-pci@vger.kernel.org, rajatja@google.com
Description: List of uncorrectable fatal errors seen and reported by this
PCI device using ERR_FATAL. Note that since multiple errors may
@@ -62,7 +62,7 @@
What: /sys/bus/pci/devices/<dev>/aer_dev_nonfatal
Date: July 2018
-KernelVersion: 4.19.0
+KernelVersion: 4.19.0
Contact: linux-pci@vger.kernel.org, rajatja@google.com
Description: List of uncorrectable nonfatal errors seen and reported by this
PCI device using ERR_NONFATAL. Note that since multiple errors
@@ -100,20 +100,20 @@
device, so these counters include them and are thus cumulative of all the error
messages on the PCI hierarchy originating at that root port.
-What: /sys/bus/pci/devices/<dev>/aer_stats/aer_rootport_total_err_cor
+What: /sys/bus/pci/devices/<dev>/aer_rootport_total_err_cor
Date: July 2018
-KernelVersion: 4.19.0
+KernelVersion: 4.19.0
Contact: linux-pci@vger.kernel.org, rajatja@google.com
Description: Total number of ERR_COR messages reported to rootport.
-What: /sys/bus/pci/devices/<dev>/aer_stats/aer_rootport_total_err_fatal
+What: /sys/bus/pci/devices/<dev>/aer_rootport_total_err_fatal
Date: July 2018
-KernelVersion: 4.19.0
+KernelVersion: 4.19.0
Contact: linux-pci@vger.kernel.org, rajatja@google.com
Description: Total number of ERR_FATAL messages reported to rootport.
-What: /sys/bus/pci/devices/<dev>/aer_stats/aer_rootport_total_err_nonfatal
+What: /sys/bus/pci/devices/<dev>/aer_rootport_total_err_nonfatal
Date: July 2018
-KernelVersion: 4.19.0
+KernelVersion: 4.19.0
Contact: linux-pci@vger.kernel.org, rajatja@google.com
Description: Total number of ERR_NONFATAL messages reported to rootport.
diff --git a/Documentation/ABI/testing/sysfs-bus-pci-devices-avs b/Documentation/ABI/testing/sysfs-bus-pci-devices-avs
new file mode 100644
index 0000000..ebff3fa
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-pci-devices-avs
@@ -0,0 +1,8 @@
+What: /sys/devices/pci0000:00/<dev>/avs/fw_version
+Date: February 2024
+Contact: Cezary Rojewski <cezary.rojewski@intel.com>
+Description:
+ Version of AudioDSP firmware ASoC avs driver is communicating
+ with.
+
+ Format: %d.%d.%d.%d, type:major:minor:build.
diff --git a/Documentation/ABI/testing/sysfs-bus-usb b/Documentation/ABI/testing/sysfs-bus-usb
index 2b7108e..af9b653 100644
--- a/Documentation/ABI/testing/sysfs-bus-usb
+++ b/Documentation/ABI/testing/sysfs-bus-usb
@@ -442,6 +442,16 @@
Description:
Contains the interface descriptors, in binary.
+What: /sys/bus/usb/devices/usbX/bos_descriptors
+Date: March 2024
+Contact: Elbert Mai <code@elbertmai.com>
+Description:
+ Binary file containing the cached binary device object store (BOS)
+ of the device. This consists of the BOS descriptor followed by the
+ set of device capability descriptors. All descriptors read from
+ this file are in bus-endian format. Note that the kernel will not
+ request the BOS from a device if its bcdUSB is less than 0x0201.
+
What: /sys/bus/usb/devices/usbX/idProduct
Description:
Product ID, in hexadecimal.
diff --git a/Documentation/ABI/testing/sysfs-bus-vdpa b/Documentation/ABI/testing/sysfs-bus-vdpa
index 4da5387..2c833b5 100644
--- a/Documentation/ABI/testing/sysfs-bus-vdpa
+++ b/Documentation/ABI/testing/sysfs-bus-vdpa
@@ -1,6 +1,6 @@
What: /sys/bus/vdpa/drivers_autoprobe
Date: March 2020
-Contact: virtualization@lists.linux-foundation.org
+Contact: virtualization@lists.linux.dev
Description:
This file determines whether new devices are immediately bound
to a driver after the creation. It initially contains 1, which
@@ -12,7 +12,7 @@
What: /sys/bus/vdpa/driver_probe
Date: March 2020
-Contact: virtualization@lists.linux-foundation.org
+Contact: virtualization@lists.linux.dev
Description:
Writing a device name to this file will cause the kernel binds
devices to a compatible driver.
@@ -22,7 +22,7 @@
What: /sys/bus/vdpa/drivers/.../bind
Date: March 2020
-Contact: virtualization@lists.linux-foundation.org
+Contact: virtualization@lists.linux.dev
Description:
Writing a device name to this file will cause the driver to
attempt to bind to the device. This is useful for overriding
@@ -30,7 +30,7 @@
What: /sys/bus/vdpa/drivers/.../unbind
Date: March 2020
-Contact: virtualization@lists.linux-foundation.org
+Contact: virtualization@lists.linux.dev
Description:
Writing a device name to this file will cause the driver to
attempt to unbind from the device. This may be useful when
@@ -38,7 +38,7 @@
What: /sys/bus/vdpa/devices/.../driver_override
Date: November 2021
-Contact: virtualization@lists.linux-foundation.org
+Contact: virtualization@lists.linux.dev
Description:
This file allows the driver for a device to be specified.
When specified, only a driver with a name matching the value
diff --git a/Documentation/ABI/testing/sysfs-class-hwmon b/Documentation/ABI/testing/sysfs-class-hwmon
index 3dac923..cfd0d0b 100644
--- a/Documentation/ABI/testing/sysfs-class-hwmon
+++ b/Documentation/ABI/testing/sysfs-class-hwmon
@@ -149,6 +149,15 @@
RW
+What: /sys/class/hwmon/hwmonX/inY_fault
+Description:
+ Reports a voltage hard failure (eg: shorted component)
+
+ - 1: Failed
+ - 0: Ok
+
+ RO
+
What: /sys/class/hwmon/hwmonX/cpuY_vid
Description:
CPU core reference voltage.
@@ -968,6 +977,15 @@
RW
+What: /sys/class/hwmon/hwmonX/humidityY_max_alarm
+Description:
+ Maximum humidity detection
+
+ - 0: OK
+ - 1: Maximum humidity detected
+
+ RO
+
What: /sys/class/hwmon/hwmonX/humidityY_max_hyst
Description:
Humidity hysteresis value for max limit.
@@ -987,6 +1005,15 @@
RW
+What: /sys/class/hwmon/hwmonX/humidityY_min_alarm
+Description:
+ Minimum humidity detection
+
+ - 0: OK
+ - 1: Minimum humidity detected
+
+ RO
+
What: /sys/class/hwmon/hwmonX/humidityY_min_hyst
Description:
Humidity hysteresis value for min limit.
diff --git a/Documentation/ABI/testing/sysfs-class-led-trigger-netdev b/Documentation/ABI/testing/sysfs-class-led-trigger-netdev
index a6c307c..ed46b37 100644
--- a/Documentation/ABI/testing/sysfs-class-led-trigger-netdev
+++ b/Documentation/ABI/testing/sysfs-class-led-trigger-netdev
@@ -88,6 +88,8 @@
speed of 10MBps of the named network device.
Setting this value also immediately changes the LED state.
+ Present only if the named network device supports 10Mbps link speed.
+
What: /sys/class/leds/<led>/link_100
Date: Jun 2023
KernelVersion: 6.5
@@ -101,6 +103,8 @@
speed of 100Mbps of the named network device.
Setting this value also immediately changes the LED state.
+ Present only if the named network device supports 100Mbps link speed.
+
What: /sys/class/leds/<led>/link_1000
Date: Jun 2023
KernelVersion: 6.5
@@ -114,6 +118,8 @@
speed of 1000Mbps of the named network device.
Setting this value also immediately changes the LED state.
+ Present only if the named network device supports 1000Mbps link speed.
+
What: /sys/class/leds/<led>/link_2500
Date: Nov 2023
KernelVersion: 6.8
@@ -127,6 +133,8 @@
speed of 2500Mbps of the named network device.
Setting this value also immediately changes the LED state.
+ Present only if the named network device supports 2500Mbps link speed.
+
What: /sys/class/leds/<led>/link_5000
Date: Nov 2023
KernelVersion: 6.8
@@ -140,6 +148,8 @@
speed of 5000Mbps of the named network device.
Setting this value also immediately changes the LED state.
+ Present only if the named network device supports 5000Mbps link speed.
+
What: /sys/class/leds/<led>/link_10000
Date: Nov 2023
KernelVersion: 6.8
@@ -153,6 +163,8 @@
speed of 10000Mbps of the named network device.
Setting this value also immediately changes the LED state.
+ Present only if the named network device supports 10000Mbps link speed.
+
What: /sys/class/leds/<led>/half_duplex
Date: Jun 2023
KernelVersion: 6.5
diff --git a/Documentation/ABI/testing/sysfs-class-led-trigger-tty b/Documentation/ABI/testing/sysfs-class-led-trigger-tty
index 30cef9a..308fbc3 100644
--- a/Documentation/ABI/testing/sysfs-class-led-trigger-tty
+++ b/Documentation/ABI/testing/sysfs-class-led-trigger-tty
@@ -1,11 +1,11 @@
-What: /sys/class/leds/<led>/ttyname
+What: /sys/class/leds/<tty_led>/ttyname
Date: Dec 2020
KernelVersion: 5.10
Contact: linux-leds@vger.kernel.org
Description:
Specifies the tty device name of the triggering tty
-What: /sys/class/leds/<led>/rx
+What: /sys/class/leds/<tty_led>/rx
Date: February 2024
KernelVersion: 6.8
Description:
@@ -13,7 +13,7 @@
If set to 0, the LED will not blink on reception.
If set to 1 (default), the LED will blink on reception.
-What: /sys/class/leds/<led>/tx
+What: /sys/class/leds/<tty_led>/tx
Date: February 2024
KernelVersion: 6.8
Description:
@@ -21,7 +21,7 @@
If set to 0, the LED will not blink on transmission.
If set to 1 (default), the LED will blink on transmission.
-What: /sys/class/leds/<led>/cts
+What: /sys/class/leds/<tty_led>/cts
Date: February 2024
KernelVersion: 6.8
Description:
@@ -31,7 +31,7 @@
If set to 0 (default), the LED will not evaluate CTS.
If set to 1, the LED will evaluate CTS.
-What: /sys/class/leds/<led>/dsr
+What: /sys/class/leds/<tty_led>/dsr
Date: February 2024
KernelVersion: 6.8
Description:
@@ -41,7 +41,7 @@
If set to 0 (default), the LED will not evaluate DSR.
If set to 1, the LED will evaluate DSR.
-What: /sys/class/leds/<led>/dcd
+What: /sys/class/leds/<tty_led>/dcd
Date: February 2024
KernelVersion: 6.8
Description:
@@ -51,7 +51,7 @@
If set to 0 (default), the LED will not evaluate CAR (DCD).
If set to 1, the LED will evaluate CAR (DCD).
-What: /sys/class/leds/<led>/rng
+What: /sys/class/leds/<tty_led>/rng
Date: February 2024
KernelVersion: 6.8
Description:
diff --git a/Documentation/ABI/testing/sysfs-class-net-queues b/Documentation/ABI/testing/sysfs-class-net-queues
index 5bff64d..84aa25e 100644
--- a/Documentation/ABI/testing/sysfs-class-net-queues
+++ b/Documentation/ABI/testing/sysfs-class-net-queues
@@ -96,3 +96,26 @@
Indicates the absolute minimum limit of bytes allowed to be
queued on this network device transmit queue. Default value is
0.
+
+What: /sys/class/net/<iface>/queues/tx-<queue>/byte_queue_limits/stall_thrs
+Date: Jan 2024
+KernelVersion: 6.9
+Contact: netdev@vger.kernel.org
+Description:
+ Tx completion stall detection threshold in ms. Kernel will
+ guarantee to detect all stalls longer than this threshold but
+ may also detect stalls longer than half of the threshold.
+
+What: /sys/class/net/<iface>/queues/tx-<queue>/byte_queue_limits/stall_cnt
+Date: Jan 2024
+KernelVersion: 6.9
+Contact: netdev@vger.kernel.org
+Description:
+ Number of detected Tx completion stalls.
+
+What: /sys/class/net/<iface>/queues/tx-<queue>/byte_queue_limits/stall_max
+Date: Jan 2024
+KernelVersion: 6.9
+Contact: netdev@vger.kernel.org
+Description:
+ Longest detected Tx completion stall. Write 0 to clear.
diff --git a/Documentation/ABI/testing/sysfs-class-net-statistics b/Documentation/ABI/testing/sysfs-class-net-statistics
index 55db278..53e508c 100644
--- a/Documentation/ABI/testing/sysfs-class-net-statistics
+++ b/Documentation/ABI/testing/sysfs-class-net-statistics
@@ -1,4 +1,4 @@
-What: /sys/class/<iface>/statistics/collisions
+What: /sys/class/net/<iface>/statistics/collisions
Date: April 2005
KernelVersion: 2.6.12
Contact: netdev@vger.kernel.org
@@ -6,7 +6,7 @@
Indicates the number of collisions seen by this network device.
This value might not be relevant with all MAC layers.
-What: /sys/class/<iface>/statistics/multicast
+What: /sys/class/net/<iface>/statistics/multicast
Date: April 2005
KernelVersion: 2.6.12
Contact: netdev@vger.kernel.org
@@ -14,7 +14,7 @@
Indicates the number of multicast packets received by this
network device.
-What: /sys/class/<iface>/statistics/rx_bytes
+What: /sys/class/net/<iface>/statistics/rx_bytes
Date: April 2005
KernelVersion: 2.6.12
Contact: netdev@vger.kernel.org
@@ -23,7 +23,7 @@
See the network driver for the exact meaning of when this
value is incremented.
-What: /sys/class/<iface>/statistics/rx_compressed
+What: /sys/class/net/<iface>/statistics/rx_compressed
Date: April 2005
KernelVersion: 2.6.12
Contact: netdev@vger.kernel.org
@@ -32,7 +32,7 @@
network device. This value might only be relevant for interfaces
that support packet compression (e.g: PPP).
-What: /sys/class/<iface>/statistics/rx_crc_errors
+What: /sys/class/net/<iface>/statistics/rx_crc_errors
Date: April 2005
KernelVersion: 2.6.12
Contact: netdev@vger.kernel.org
@@ -41,7 +41,7 @@
by this network device. Note that the specific meaning might
depend on the MAC layer used by the interface.
-What: /sys/class/<iface>/statistics/rx_dropped
+What: /sys/class/net/<iface>/statistics/rx_dropped
Date: April 2005
KernelVersion: 2.6.12
Contact: netdev@vger.kernel.org
@@ -51,7 +51,7 @@
packet processing. See the network driver for the exact
meaning of this value.
-What: /sys/class/<iface>/statistics/rx_errors
+What: /sys/class/net/<iface>/statistics/rx_errors
Date: April 2005
KernelVersion: 2.6.12
Contact: netdev@vger.kernel.org
@@ -59,7 +59,7 @@
Indicates the number of receive errors on this network device.
See the network driver for the exact meaning of this value.
-What: /sys/class/<iface>/statistics/rx_fifo_errors
+What: /sys/class/net/<iface>/statistics/rx_fifo_errors
Date: April 2005
KernelVersion: 2.6.12
Contact: netdev@vger.kernel.org
@@ -68,7 +68,7 @@
network device. See the network driver for the exact
meaning of this value.
-What: /sys/class/<iface>/statistics/rx_frame_errors
+What: /sys/class/net/<iface>/statistics/rx_frame_errors
Date: April 2005
KernelVersion: 2.6.12
Contact: netdev@vger.kernel.org
@@ -78,7 +78,7 @@
on the MAC layer protocol used. See the network driver for
the exact meaning of this value.
-What: /sys/class/<iface>/statistics/rx_length_errors
+What: /sys/class/net/<iface>/statistics/rx_length_errors
Date: April 2005
KernelVersion: 2.6.12
Contact: netdev@vger.kernel.org
@@ -87,7 +87,7 @@
error, oversized or undersized. See the network driver for the
exact meaning of this value.
-What: /sys/class/<iface>/statistics/rx_missed_errors
+What: /sys/class/net/<iface>/statistics/rx_missed_errors
Date: April 2005
KernelVersion: 2.6.12
Contact: netdev@vger.kernel.org
@@ -96,7 +96,7 @@
due to lack of capacity in the receive side. See the network
driver for the exact meaning of this value.
-What: /sys/class/<iface>/statistics/rx_nohandler
+What: /sys/class/net/<iface>/statistics/rx_nohandler
Date: February 2016
KernelVersion: 4.6
Contact: netdev@vger.kernel.org
@@ -104,7 +104,7 @@
Indicates the number of received packets that were dropped on
an inactive device by the network core.
-What: /sys/class/<iface>/statistics/rx_over_errors
+What: /sys/class/net/<iface>/statistics/rx_over_errors
Date: April 2005
KernelVersion: 2.6.12
Contact: netdev@vger.kernel.org
@@ -114,7 +114,7 @@
(e.g: larger than MTU). See the network driver for the exact
meaning of this value.
-What: /sys/class/<iface>/statistics/rx_packets
+What: /sys/class/net/<iface>/statistics/rx_packets
Date: April 2005
KernelVersion: 2.6.12
Contact: netdev@vger.kernel.org
@@ -122,7 +122,7 @@
Indicates the total number of good packets received by this
network device.
-What: /sys/class/<iface>/statistics/tx_aborted_errors
+What: /sys/class/net/<iface>/statistics/tx_aborted_errors
Date: April 2005
KernelVersion: 2.6.12
Contact: netdev@vger.kernel.org
@@ -132,7 +132,7 @@
a medium collision). See the network driver for the exact
meaning of this value.
-What: /sys/class/<iface>/statistics/tx_bytes
+What: /sys/class/net/<iface>/statistics/tx_bytes
Date: April 2005
KernelVersion: 2.6.12
Contact: netdev@vger.kernel.org
@@ -143,7 +143,7 @@
transmitted packets or all packets that have been queued for
transmission.
-What: /sys/class/<iface>/statistics/tx_carrier_errors
+What: /sys/class/net/<iface>/statistics/tx_carrier_errors
Date: April 2005
KernelVersion: 2.6.12
Contact: netdev@vger.kernel.org
@@ -152,7 +152,7 @@
because of carrier errors (e.g: physical link down). See the
network driver for the exact meaning of this value.
-What: /sys/class/<iface>/statistics/tx_compressed
+What: /sys/class/net/<iface>/statistics/tx_compressed
Date: April 2005
KernelVersion: 2.6.12
Contact: netdev@vger.kernel.org
@@ -161,7 +161,7 @@
this might only be relevant for devices that support
compression (e.g: PPP).
-What: /sys/class/<iface>/statistics/tx_dropped
+What: /sys/class/net/<iface>/statistics/tx_dropped
Date: April 2005
KernelVersion: 2.6.12
Contact: netdev@vger.kernel.org
@@ -170,7 +170,7 @@
See the driver for the exact reasons as to why the packets were
dropped.
-What: /sys/class/<iface>/statistics/tx_errors
+What: /sys/class/net/<iface>/statistics/tx_errors
Date: April 2005
KernelVersion: 2.6.12
Contact: netdev@vger.kernel.org
@@ -179,7 +179,7 @@
a network device. See the driver for the exact reasons as to
why the packets were dropped.
-What: /sys/class/<iface>/statistics/tx_fifo_errors
+What: /sys/class/net/<iface>/statistics/tx_fifo_errors
Date: April 2005
KernelVersion: 2.6.12
Contact: netdev@vger.kernel.org
@@ -188,7 +188,7 @@
FIFO error. See the driver for the exact reasons as to why the
packets were dropped.
-What: /sys/class/<iface>/statistics/tx_heartbeat_errors
+What: /sys/class/net/<iface>/statistics/tx_heartbeat_errors
Date: April 2005
KernelVersion: 2.6.12
Contact: netdev@vger.kernel.org
@@ -197,7 +197,7 @@
reported as heartbeat errors. See the driver for the exact
reasons as to why the packets were dropped.
-What: /sys/class/<iface>/statistics/tx_packets
+What: /sys/class/net/<iface>/statistics/tx_packets
Date: April 2005
KernelVersion: 2.6.12
Contact: netdev@vger.kernel.org
@@ -206,7 +206,7 @@
device. See the driver for whether this reports the number of all
attempted or successful transmissions.
-What: /sys/class/<iface>/statistics/tx_window_errors
+What: /sys/class/net/<iface>/statistics/tx_window_errors
Date: April 2005
KernelVersion: 2.6.12
Contact: netdev@vger.kernel.org
diff --git a/Documentation/ABI/testing/sysfs-class-usb_role b/Documentation/ABI/testing/sysfs-class-usb_role
index 3b810a4..9fab3f0 100644
--- a/Documentation/ABI/testing/sysfs-class-usb_role
+++ b/Documentation/ABI/testing/sysfs-class-usb_role
@@ -19,3 +19,9 @@
- none
- host
- device
+
+What: /sys/class/usb_role/<switch>/connector
+Date: Feb 2024
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ Optional symlink to the USB Type-C connector.
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index a1db6db..710d47b 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -516,6 +516,7 @@
/sys/devices/system/cpu/vulnerabilities/mds
/sys/devices/system/cpu/vulnerabilities/meltdown
/sys/devices/system/cpu/vulnerabilities/mmio_stale_data
+ /sys/devices/system/cpu/vulnerabilities/reg_file_data_sampling
/sys/devices/system/cpu/vulnerabilities/retbleed
/sys/devices/system/cpu/vulnerabilities/spec_store_bypass
/sys/devices/system/cpu/vulnerabilities/spectre_v1
diff --git a/Documentation/ABI/testing/sysfs-driver-qat b/Documentation/ABI/testing/sysfs-driver-qat
index bbf329c..96020fb 100644
--- a/Documentation/ABI/testing/sysfs-driver-qat
+++ b/Documentation/ABI/testing/sysfs-driver-qat
@@ -141,3 +141,23 @@
64
This attribute is only available for qat_4xxx devices.
+
+What: /sys/bus/pci/devices/<BDF>/qat/auto_reset
+Date: March 2024
+KernelVersion: 6.8
+Contact: qat-linux@intel.com
+Description: (RW) Reports the current state of the autoreset feature
+ for a QAT device
+
+ Write to the attribute to enable or disable device auto reset.
+
+ Device auto reset is disabled by default.
+
+ The values are:
+
+ * 1/Yy/on: auto reset enabled. If the device encounters an
+ unrecoverable error, it will be reset automatically.
+ * 0/Nn/off: auto reset disabled. If the device encounters an
+ unrecoverable error, it will not be reset.
+
+ This attribute is only available for qat_4xxx devices.
diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs
index 99fa87a..1a4d839 100644
--- a/Documentation/ABI/testing/sysfs-fs-f2fs
+++ b/Documentation/ABI/testing/sysfs-fs-f2fs
@@ -205,7 +205,7 @@
What: /sys/fs/f2fs/<disk>/discard_idle_interval
Date: September 2018
Contact: "Chao Yu" <yuchao0@huawei.com>
-Contact: "Sahitya Tummala" <stummala@codeaurora.org>
+Contact: "Sahitya Tummala" <quic_stummala@quicinc.com>
Description: Controls the idle timing of discard thread given
this time interval.
Default is 5 secs.
@@ -213,7 +213,7 @@
What: /sys/fs/f2fs/<disk>/gc_idle_interval
Date: September 2018
Contact: "Chao Yu" <yuchao0@huawei.com>
-Contact: "Sahitya Tummala" <stummala@codeaurora.org>
+Contact: "Sahitya Tummala" <quic_stummala@quicinc.com>
Description: Controls the idle timing for gc path. Set to 5 seconds by default.
What: /sys/fs/f2fs/<disk>/iostat_enable
@@ -701,29 +701,31 @@
enabled with fault_injection option, fault type value
is shown below, it supports single or combined type.
- =================== ===========
- Type_Name Type_Value
- =================== ===========
- FAULT_KMALLOC 0x000000001
- FAULT_KVMALLOC 0x000000002
- FAULT_PAGE_ALLOC 0x000000004
- FAULT_PAGE_GET 0x000000008
- FAULT_ALLOC_BIO 0x000000010 (obsolete)
- FAULT_ALLOC_NID 0x000000020
- FAULT_ORPHAN 0x000000040
- FAULT_BLOCK 0x000000080
- FAULT_DIR_DEPTH 0x000000100
- FAULT_EVICT_INODE 0x000000200
- FAULT_TRUNCATE 0x000000400
- FAULT_READ_IO 0x000000800
- FAULT_CHECKPOINT 0x000001000
- FAULT_DISCARD 0x000002000
- FAULT_WRITE_IO 0x000004000
- FAULT_SLAB_ALLOC 0x000008000
- FAULT_DQUOT_INIT 0x000010000
- FAULT_LOCK_OP 0x000020000
- FAULT_BLKADDR 0x000040000
- =================== ===========
+ =========================== ===========
+ Type_Name Type_Value
+ =========================== ===========
+ FAULT_KMALLOC 0x000000001
+ FAULT_KVMALLOC 0x000000002
+ FAULT_PAGE_ALLOC 0x000000004
+ FAULT_PAGE_GET 0x000000008
+ FAULT_ALLOC_BIO 0x000000010 (obsolete)
+ FAULT_ALLOC_NID 0x000000020
+ FAULT_ORPHAN 0x000000040
+ FAULT_BLOCK 0x000000080
+ FAULT_DIR_DEPTH 0x000000100
+ FAULT_EVICT_INODE 0x000000200
+ FAULT_TRUNCATE 0x000000400
+ FAULT_READ_IO 0x000000800
+ FAULT_CHECKPOINT 0x000001000
+ FAULT_DISCARD 0x000002000
+ FAULT_WRITE_IO 0x000004000
+ FAULT_SLAB_ALLOC 0x000008000
+ FAULT_DQUOT_INIT 0x000010000
+ FAULT_LOCK_OP 0x000020000
+ FAULT_BLKADDR_VALIDITY 0x000040000
+ FAULT_BLKADDR_CONSISTENCE 0x000080000
+ FAULT_NO_SEGMENT 0x000100000
+ =========================== ===========
What: /sys/fs/f2fs/<disk>/discard_io_aware_gran
Date: January 2023
diff --git a/Documentation/ABI/testing/sysfs-fs-virtiofs b/Documentation/ABI/testing/sysfs-fs-virtiofs
new file mode 100644
index 0000000..4839dbc
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-fs-virtiofs
@@ -0,0 +1,11 @@
+What: /sys/fs/virtiofs/<n>/tag
+Date: Feb 2024
+Contact: virtio-fs@lists.linux.dev
+Description:
+ [RO] The mount "tag" that can be used to mount this filesystem.
+
+What: /sys/fs/virtiofs/<n>/device
+Date: Feb 2024
+Contact: virtio-fs@lists.linux.dev
+Description:
+ Symlink to the virtio device that exports this filesystem.
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-cma b/Documentation/ABI/testing/sysfs-kernel-mm-cma
index 02b2bb6..dfd7552 100644
--- a/Documentation/ABI/testing/sysfs-kernel-mm-cma
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-cma
@@ -23,3 +23,9 @@
Contact: Minchan Kim <minchan@kernel.org>
Description:
the number of pages CMA API failed to allocate
+
+What: /sys/kernel/mm/cma/<cma-heap-name>/release_pages_success
+Date: Feb 2024
+Contact: Anshuman Khandual <anshuman.khandual@arm.com>
+Description:
+ the number of pages CMA API succeeded to release
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon
index bfa5b82..dad4d5f 100644
--- a/Documentation/ABI/testing/sysfs-kernel-mm-damon
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon
@@ -34,7 +34,9 @@
kdamond. Writing 'update_schemes_tried_bytes' to the file
updates only '.../tried_regions/total_bytes' files of this
kdamond. Writing 'clear_schemes_tried_regions' to the file
- removes contents of the 'tried_regions' directory.
+ removes contents of the 'tried_regions' directory. Writing
+ 'update_schemes_effective_quotas' to the file updates
+ '.../quotas/effective_bytes' files of this kdamond.
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/pid
Date: Mar 2022
@@ -208,6 +210,12 @@
Description: Writing to and reading from this file sets and gets the size
quota of the scheme in bytes.
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/quotas/effective_bytes
+Date: Feb 2024
+Contact: SeongJae Park <sj@kernel.org>
+Description: Reading from this file gets the effective size quota of the
+ scheme in bytes, which adjusted for the time quota and goals.
+
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/quotas/reset_interval_ms
Date: Mar 2022
Contact: SeongJae Park <sj@kernel.org>
@@ -221,6 +229,12 @@
directories for setting automatic tuning of the scheme's
aggressiveness named '0' to 'N-1' under the goals/ directory.
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/quotas/goals/<G>/target_metric
+Date: Feb 2024
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing to and reading from this file sets and gets the quota
+ auto-tuning goal metric.
+
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/quotas/goals/<G>/target_value
Date: Nov 2023
Contact: SeongJae Park <sj@kernel.org>
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy
new file mode 100644
index 0000000..8ac327f
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy
@@ -0,0 +1,4 @@
+What: /sys/kernel/mm/mempolicy/
+Date: January 2024
+Contact: Linux memory management mailing list <linux-mm@kvack.org>
+Description: Interface for Mempolicy
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave
new file mode 100644
index 0000000..0b7972d
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave
@@ -0,0 +1,25 @@
+What: /sys/kernel/mm/mempolicy/weighted_interleave/
+Date: January 2024
+Contact: Linux memory management mailing list <linux-mm@kvack.org>
+Description: Configuration Interface for the Weighted Interleave policy
+
+What: /sys/kernel/mm/mempolicy/weighted_interleave/nodeN
+Date: January 2024
+Contact: Linux memory management mailing list <linux-mm@kvack.org>
+Description: Weight configuration interface for nodeN
+
+ The interleave weight for a memory node (N). These weights are
+ utilized by tasks which have set their mempolicy to
+ MPOL_WEIGHTED_INTERLEAVE.
+
+ These weights only affect new allocations, and changes at runtime
+ will not cause migrations on already allocated pages.
+
+ The minimum weight for a node is always 1.
+
+ Minimum weight: 1
+ Maximum weight: 255
+
+ Writing an empty string or `0` will reset the weight to the
+ system default. The system default may be set by the kernel
+ or drivers at boot or during hotplug events.
diff --git a/Documentation/ABI/testing/sysfs-nvmem-cells b/Documentation/ABI/testing/sysfs-nvmem-cells
index 7af70ad..c7c9444 100644
--- a/Documentation/ABI/testing/sysfs-nvmem-cells
+++ b/Documentation/ABI/testing/sysfs-nvmem-cells
@@ -4,18 +4,18 @@
Contact: Miquel Raynal <miquel.raynal@bootlin.com>
Description:
The "cells" folder contains one file per cell exposed by the
- NVMEM device. The name of the file is: <name>@<where>, with
- <name> being the cell name and <where> its location in the NVMEM
- device, in hexadecimal (without the '0x' prefix, to mimic device
- tree node names). The length of the file is the size of the cell
- (when known). The content of the file is the binary content of
- the cell (may sometimes be ASCII, likely without trailing
- character).
+ NVMEM device. The name of the file is: "<name>@<byte>,<bit>",
+ with <name> being the cell name and <where> its location in
+ the NVMEM device, in hexadecimal bytes and bits (without the
+ '0x' prefix, to mimic device tree node names). The length of
+ the file is the size of the cell (when known). The content of
+ the file is the binary content of the cell (may sometimes be
+ ASCII, likely without trailing character).
Note: This file is only present if CONFIG_NVMEM_SYSFS
is enabled.
Example::
- hexdump -C /sys/bus/nvmem/devices/1-00563/cells/product-name@d
+ hexdump -C /sys/bus/nvmem/devices/1-00563/cells/product-name@d,0
00000000 54 4e 34 38 4d 2d 50 2d 44 4e |TN48M-P-DN|
0000000a
diff --git a/Documentation/Makefile b/Documentation/Makefile
index 3885bbe..b68f8c8 100644
--- a/Documentation/Makefile
+++ b/Documentation/Makefile
@@ -111,7 +111,9 @@
$(YNL_RST_DIR)/%.rst: $(YNL_YAML_DIR)/%.yaml $(YNL_TOOL)
$(Q)$(YNL_TOOL) -i $< -o $@
-htmldocs: $(YNL_INDEX)
+htmldocs texinfodocs latexdocs epubdocs xmldocs: $(YNL_INDEX)
+
+htmldocs:
@$(srctree)/scripts/sphinx-pre-install --version-check
@+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,html,$(var),,$(var)))
@@ -176,6 +178,7 @@
$(Q)cd $(srctree);scripts/documentation-file-ref-check
cleandocs:
+ $(Q)rm -f $(YNL_INDEX) $(YNL_RST_FILES)
$(Q)rm -rf $(BUILDDIR)
$(Q)$(MAKE) BUILDDIR=$(abspath $(BUILDDIR)) $(build)=Documentation/userspace-api/media clean
diff --git a/Documentation/RAS/ras.rst b/Documentation/RAS/ras.rst
deleted file mode 100644
index 2556b39..0000000
--- a/Documentation/RAS/ras.rst
+++ /dev/null
@@ -1,26 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-Reliability, Availability and Serviceability features
-=====================================================
-
-This documents different aspects of the RAS functionality present in the
-kernel.
-
-Error decoding
----------------
-
-* x86
-
-Error decoding on AMD systems should be done using the rasdaemon tool:
-https://github.com/mchehab/rasdaemon/
-
-While the daemon is running, it would automatically log and decode
-errors. If not, one can still decode such errors by supplying the
-hardware information from the error::
-
- $ rasdaemon -p --status <STATUS> --ipid <IPID> --smca
-
-Also, the user can pass particular family and model to decode the error
-string::
-
- $ rasdaemon -p --status <STATUS> --ipid <IPID> --smca --family <CPU Family> --model <CPU Model> --bank <BANK_NUM>
diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst
index 2d42998..3e6407d 100644
--- a/Documentation/RCU/checklist.rst
+++ b/Documentation/RCU/checklist.rst
@@ -68,7 +68,8 @@
rcu_read_lock_sched(), or by the appropriate update-side lock.
Explicit disabling of preemption (preempt_disable(), for example)
can serve as rcu_read_lock_sched(), but is less readable and
- prevents lockdep from detecting locking issues.
+ prevents lockdep from detecting locking issues. Acquiring a
+ spinlock also enters an RCU read-side critical section.
Please note that you *cannot* rely on code known to be built
only in non-preemptible kernels. Such code can and will break,
@@ -382,16 +383,17 @@
must use whatever locking or other synchronization is required
to safely access and/or modify that data structure.
- Do not assume that RCU callbacks will be executed on the same
- CPU that executed the corresponding call_rcu() or call_srcu().
- For example, if a given CPU goes offline while having an RCU
- callback pending, then that RCU callback will execute on some
- surviving CPU. (If this was not the case, a self-spawning RCU
- callback would prevent the victim CPU from ever going offline.)
- Furthermore, CPUs designated by rcu_nocbs= might well *always*
- have their RCU callbacks executed on some other CPUs, in fact,
- for some real-time workloads, this is the whole point of using
- the rcu_nocbs= kernel boot parameter.
+ Do not assume that RCU callbacks will be executed on
+ the same CPU that executed the corresponding call_rcu(),
+ call_srcu(), call_rcu_tasks(), call_rcu_tasks_rude(), or
+ call_rcu_tasks_trace(). For example, if a given CPU goes offline
+ while having an RCU callback pending, then that RCU callback
+ will execute on some surviving CPU. (If this was not the case,
+ a self-spawning RCU callback would prevent the victim CPU from
+ ever going offline.) Furthermore, CPUs designated by rcu_nocbs=
+ might well *always* have their RCU callbacks executed on some
+ other CPUs, in fact, for some real-time workloads, this is the
+ whole point of using the rcu_nocbs= kernel boot parameter.
In addition, do not assume that callbacks queued in a given order
will be invoked in that order, even if they all are queued on the
@@ -444,7 +446,7 @@
real-time workloads than is synchronize_rcu_expedited().
It is also permissible to sleep in RCU Tasks Trace read-side
- critical, which are delimited by rcu_read_lock_trace() and
+ critical section, which are delimited by rcu_read_lock_trace() and
rcu_read_unlock_trace(). However, this is a specialized flavor
of RCU, and you should not use it without first checking with
its current users. In most cases, you should instead use SRCU.
@@ -490,6 +492,12 @@
since the last time that you passed that same object to
call_rcu() (or friends).
+ CONFIG_RCU_STRICT_GRACE_PERIOD:
+ combine with KASAN to check for pointers leaked out
+ of RCU read-side critical sections. This Kconfig
+ option is tough on both performance and scalability,
+ and so is limited to four-CPU systems.
+
__rcu sparse checks:
tag the pointer to the RCU-protected data structure
with __rcu, and sparse will warn you if you access that
diff --git a/Documentation/RCU/rcu_dereference.rst b/Documentation/RCU/rcu_dereference.rst
index 659d591..2524dcd 100644
--- a/Documentation/RCU/rcu_dereference.rst
+++ b/Documentation/RCU/rcu_dereference.rst
@@ -408,7 +408,10 @@
RCU flavors, an RCU read-side critical section is entered
using rcu_read_lock(), anything that disables bottom halves,
anything that disables interrupts, or anything that disables
- preemption.
+ preemption. Please note that spinlock critical sections
+ are also implied RCU read-side critical sections, even when
+ they are preemptible, as they are in kernels built with
+ CONFIG_PREEMPT_RT=y.
2. If the access might be within an RCU read-side critical section
on the one hand, or protected by (say) my_lock on the other,
diff --git a/Documentation/RCU/torture.rst b/Documentation/RCU/torture.rst
index 49e7bee..4b1f99c 100644
--- a/Documentation/RCU/torture.rst
+++ b/Documentation/RCU/torture.rst
@@ -318,7 +318,7 @@
tools/testing/selftests/rcutorture/res/2022.11.03-11.26.28
-Then this run can be re-run without rebuilding as follow:
+Then this run can be re-run without rebuilding as follow::
kvm-again.sh tools/testing/selftests/rcutorture/res/2022.11.03-11.26.28
diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst
index 60ce024..872ac66 100644
--- a/Documentation/RCU/whatisRCU.rst
+++ b/Documentation/RCU/whatisRCU.rst
@@ -172,14 +172,25 @@
critical section. Reference counts may be used in conjunction
with RCU to maintain longer-term references to data structures.
+ Note that anything that disables bottom halves, preemption,
+ or interrupts also enters an RCU read-side critical section.
+ Acquiring a spinlock also enters an RCU read-side critical
+ sections, even for spinlocks that do not disable preemption,
+ as is the case in kernels built with CONFIG_PREEMPT_RT=y.
+ Sleeplocks do *not* enter RCU read-side critical sections.
+
rcu_read_unlock()
^^^^^^^^^^^^^^^^^
void rcu_read_unlock(void);
This temporal primitives is used by a reader to inform the
reclaimer that the reader is exiting an RCU read-side critical
- section. Note that RCU read-side critical sections may be nested
- and/or overlapping.
+ section. Anything that enables bottom halves, preemption,
+ or interrupts also exits an RCU read-side critical section.
+ Releasing a spinlock also exits an RCU read-side critical section.
+
+ Note that RCU read-side critical sections may be nested and/or
+ overlapping.
synchronize_rcu()
^^^^^^^^^^^^^^^^^
@@ -952,8 +963,8 @@
initialized after each and every call to kmem_cache_alloc(), which renders
reference-free spinlock acquisition completely unsafe. Therefore, when
using ``SLAB_TYPESAFE_BY_RCU``, make proper use of a reference counter.
-(Those willing to use a kmem_cache constructor may also use locking,
-including cache-friendly sequence locking.)
+(Those willing to initialize their locks in a kmem_cache constructor
+may also use locking, including cache-friendly sequence locking.)
With traditional reference counting -- such as that implemented by the
kref library in Linux -- there is typically code that runs when the last
diff --git a/Documentation/admin-guide/RAS/address-translation.rst b/Documentation/admin-guide/RAS/address-translation.rst
new file mode 100644
index 0000000..f0ca17b
--- /dev/null
+++ b/Documentation/admin-guide/RAS/address-translation.rst
@@ -0,0 +1,24 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Address translation
+===================
+
+x86 AMD
+-------
+
+Zen-based AMD systems include a Data Fabric that manages the layout of
+physical memory. Devices attached to the Fabric, like memory controllers,
+I/O, etc., may not have a complete view of the system physical memory map.
+These devices may provide a "normalized", i.e. device physical, address
+when reporting memory errors. Normalized addresses must be translated to
+a system physical address for the kernel to action on the memory.
+
+AMD Address Translation Library (CONFIG_AMD_ATL) provides translation for
+this case.
+
+Glossary of acronyms used in address translation for Zen-based systems
+
+* CCM = Cache Coherent Moderator
+* COD = Cluster-on-Die
+* COH_ST = Coherent Station
+* DF = Data Fabric
diff --git a/Documentation/admin-guide/RAS/error-decoding.rst b/Documentation/admin-guide/RAS/error-decoding.rst
new file mode 100644
index 0000000..26a72f3
--- /dev/null
+++ b/Documentation/admin-guide/RAS/error-decoding.rst
@@ -0,0 +1,21 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Error decoding
+==============
+
+x86
+---
+
+Error decoding on AMD systems should be done using the rasdaemon tool:
+https://github.com/mchehab/rasdaemon/
+
+While the daemon is running, it would automatically log and decode
+errors. If not, one can still decode such errors by supplying the
+hardware information from the error::
+
+ $ rasdaemon -p --status <STATUS> --ipid <IPID> --smca
+
+Also, the user can pass particular family and model to decode the error
+string::
+
+ $ rasdaemon -p --status <STATUS> --ipid <IPID> --smca --family <CPU Family> --model <CPU Model> --bank <BANK_NUM>
diff --git a/Documentation/admin-guide/RAS/index.rst b/Documentation/admin-guide/RAS/index.rst
new file mode 100644
index 0000000..f408704
--- /dev/null
+++ b/Documentation/admin-guide/RAS/index.rst
@@ -0,0 +1,7 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. toctree::
+ :maxdepth: 2
+
+ main
+ error-decoding
+ address-translation
diff --git a/Documentation/admin-guide/RAS/main.rst b/Documentation/admin-guide/RAS/main.rst
new file mode 100644
index 0000000..7ac1d4c
--- /dev/null
+++ b/Documentation/admin-guide/RAS/main.rst
@@ -0,0 +1,1223 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: <isonum.txt>
+
+==================================================
+Reliability, Availability and Serviceability (RAS)
+==================================================
+
+This documents different aspects of the RAS functionality present in the
+kernel.
+
+RAS concepts
+************
+
+Reliability, Availability and Serviceability (RAS) is a concept used on
+servers meant to measure their robustness.
+
+Reliability
+ is the probability that a system will produce correct outputs.
+
+ * Generally measured as Mean Time Between Failures (MTBF)
+ * Enhanced by features that help to avoid, detect and repair hardware faults
+
+Availability
+ is the probability that a system is operational at a given time
+
+ * Generally measured as a percentage of downtime per a period of time
+ * Often uses mechanisms to detect and correct hardware faults in
+ runtime;
+
+Serviceability (or maintainability)
+ is the simplicity and speed with which a system can be repaired or
+ maintained
+
+ * Generally measured on Mean Time Between Repair (MTBR)
+
+Improving RAS
+-------------
+
+In order to reduce systems downtime, a system should be capable of detecting
+hardware errors, and, when possible correcting them in runtime. It should
+also provide mechanisms to detect hardware degradation, in order to warn
+the system administrator to take the action of replacing a component before
+it causes data loss or system downtime.
+
+Among the monitoring measures, the most usual ones include:
+
+* CPU – detect errors at instruction execution and at L1/L2/L3 caches;
+* Memory – add error correction logic (ECC) to detect and correct errors;
+* I/O – add CRC checksums for transferred data;
+* Storage – RAID, journal file systems, checksums,
+ Self-Monitoring, Analysis and Reporting Technology (SMART).
+
+By monitoring the number of occurrences of error detections, it is possible
+to identify if the probability of hardware errors is increasing, and, on such
+case, do a preventive maintenance to replace a degraded component while
+those errors are correctable.
+
+Types of errors
+---------------
+
+Most mechanisms used on modern systems use technologies like Hamming
+Codes that allow error correction when the number of errors on a bit packet
+is below a threshold. If the number of errors is above, those mechanisms
+can indicate with a high degree of confidence that an error happened, but
+they can't correct.
+
+Also, sometimes an error occur on a component that it is not used. For
+example, a part of the memory that it is not currently allocated.
+
+That defines some categories of errors:
+
+* **Correctable Error (CE)** - the error detection mechanism detected and
+ corrected the error. Such errors are usually not fatal, although some
+ Kernel mechanisms allow the system administrator to consider them as fatal.
+
+* **Uncorrected Error (UE)** - the amount of errors happened above the error
+ correction threshold, and the system was unable to auto-correct.
+
+* **Fatal Error** - when an UE error happens on a critical component of the
+ system (for example, a piece of the Kernel got corrupted by an UE), the
+ only reliable way to avoid data corruption is to hang or reboot the machine.
+
+* **Non-fatal Error** - when an UE error happens on an unused component,
+ like a CPU in power down state or an unused memory bank, the system may
+ still run, eventually replacing the affected hardware by a hot spare,
+ if available.
+
+ Also, when an error happens on a userspace process, it is also possible to
+ kill such process and let userspace restart it.
+
+The mechanism for handling non-fatal errors is usually complex and may
+require the help of some userspace application, in order to apply the
+policy desired by the system administrator.
+
+Identifying a bad hardware component
+------------------------------------
+
+Just detecting a hardware flaw is usually not enough, as the system needs
+to pinpoint to the minimal replaceable unit (MRU) that should be exchanged
+to make the hardware reliable again.
+
+So, it requires not only error logging facilities, but also mechanisms that
+will translate the error message to the silkscreen or component label for
+the MRU.
+
+Typically, it is very complex for memory, as modern CPUs interlace memory
+from different memory modules, in order to provide a better performance. The
+DMI BIOS usually have a list of memory module labels, with can be obtained
+using the ``dmidecode`` tool. For example, on a desktop machine, it shows::
+
+ Memory Device
+ Total Width: 64 bits
+ Data Width: 64 bits
+ Size: 16384 MB
+ Form Factor: SODIMM
+ Set: None
+ Locator: ChannelA-DIMM0
+ Bank Locator: BANK 0
+ Type: DDR4
+ Type Detail: Synchronous
+ Speed: 2133 MHz
+ Rank: 2
+ Configured Clock Speed: 2133 MHz
+
+On the above example, a DDR4 SO-DIMM memory module is located at the
+system's memory labeled as "BANK 0", as given by the *bank locator* field.
+Please notice that, on such system, the *total width* is equal to the
+*data width*. It means that such memory module doesn't have error
+detection/correction mechanisms.
+
+Unfortunately, not all systems use the same field to specify the memory
+bank. On this example, from an older server, ``dmidecode`` shows::
+
+ Memory Device
+ Array Handle: 0x1000
+ Error Information Handle: Not Provided
+ Total Width: 72 bits
+ Data Width: 64 bits
+ Size: 8192 MB
+ Form Factor: DIMM
+ Set: 1
+ Locator: DIMM_A1
+ Bank Locator: Not Specified
+ Type: DDR3
+ Type Detail: Synchronous Registered (Buffered)
+ Speed: 1600 MHz
+ Rank: 2
+ Configured Clock Speed: 1600 MHz
+
+There, the DDR3 RDIMM memory module is located at the system's memory labeled
+as "DIMM_A1", as given by the *locator* field. Please notice that this
+memory module has 64 bits of *data width* and 72 bits of *total width*. So,
+it has 8 extra bits to be used by error detection and correction mechanisms.
+Such kind of memory is called Error-correcting code memory (ECC memory).
+
+To make things even worse, it is not uncommon that systems with different
+labels on their system's board to use exactly the same BIOS, meaning that
+the labels provided by the BIOS won't match the real ones.
+
+ECC memory
+----------
+
+As mentioned in the previous section, ECC memory has extra bits to be
+used for error correction. In the above example, a memory module has
+64 bits of *data width*, and 72 bits of *total width*. The extra 8
+bits which are used for the error detection and correction mechanisms
+are referred to as the *syndrome*\ [#f1]_\ [#f2]_.
+
+So, when the cpu requests the memory controller to write a word with
+*data width*, the memory controller calculates the *syndrome* in real time,
+using Hamming code, or some other error correction code, like SECDED+,
+producing a code with *total width* size. Such code is then written
+on the memory modules.
+
+At read, the *total width* bits code is converted back, using the same
+ECC code used on write, producing a word with *data width* and a *syndrome*.
+The word with *data width* is sent to the CPU, even when errors happen.
+
+The memory controller also looks at the *syndrome* in order to check if
+there was an error, and if the ECC code was able to fix such error.
+If the error was corrected, a Corrected Error (CE) happened. If not, an
+Uncorrected Error (UE) happened.
+
+The information about the CE/UE errors is stored on some special registers
+at the memory controller and can be accessed by reading such registers,
+either by BIOS, by some special CPUs or by Linux EDAC driver. On x86 64
+bit CPUs, such errors can also be retrieved via the Machine Check
+Architecture (MCA)\ [#f3]_.
+
+.. [#f1] Please notice that several memory controllers allow operation on a
+ mode called "Lock-Step", where it groups two memory modules together,
+ doing 128-bit reads/writes. That gives 16 bits for error correction, with
+ significantly improves the error correction mechanism, at the expense
+ that, when an error happens, there's no way to know what memory module is
+ to blame. So, it has to blame both memory modules.
+
+.. [#f2] Some memory controllers also allow using memory in mirror mode.
+ On such mode, the same data is written to two memory modules. At read,
+ the system checks both memory modules, in order to check if both provide
+ identical data. On such configuration, when an error happens, there's no
+ way to know what memory module is to blame. So, it has to blame both
+ memory modules (or 4 memory modules, if the system is also on Lock-step
+ mode).
+
+.. [#f3] For more details about the Machine Check Architecture (MCA),
+ please read Documentation/arch/x86/x86_64/machinecheck.rst at the Kernel tree.
+
+EDAC - Error Detection And Correction
+*************************************
+
+.. note::
+
+ "bluesmoke" was the name for this device driver subsystem when it
+ was "out-of-tree" and maintained at http://bluesmoke.sourceforge.net.
+ That site is mostly archaic now and can be used only for historical
+ purposes.
+
+ When the subsystem was pushed upstream for the first time, on
+ Kernel 2.6.16, it was renamed to ``EDAC``.
+
+Purpose
+-------
+
+The ``edac`` kernel module's goal is to detect and report hardware errors
+that occur within the computer system running under linux.
+
+Memory
+------
+
+Memory Correctable Errors (CE) and Uncorrectable Errors (UE) are the
+primary errors being harvested. These types of errors are harvested by
+the ``edac_mc`` device.
+
+Detecting CE events, then harvesting those events and reporting them,
+**can** but must not necessarily be a predictor of future UE events. With
+CE events only, the system can and will continue to operate as no data
+has been damaged yet.
+
+However, preventive maintenance and proactive part replacement of memory
+modules exhibiting CEs can reduce the likelihood of the dreaded UE events
+and system panics.
+
+Other hardware elements
+-----------------------
+
+A new feature for EDAC, the ``edac_device`` class of device, was added in
+the 2.6.23 version of the kernel.
+
+This new device type allows for non-memory type of ECC hardware detectors
+to have their states harvested and presented to userspace via the sysfs
+interface.
+
+Some architectures have ECC detectors for L1, L2 and L3 caches,
+along with DMA engines, fabric switches, main data path switches,
+interconnections, and various other hardware data paths. If the hardware
+reports it, then a edac_device device probably can be constructed to
+harvest and present that to userspace.
+
+
+PCI bus scanning
+----------------
+
+In addition, PCI devices are scanned for PCI Bus Parity and SERR Errors
+in order to determine if errors are occurring during data transfers.
+
+The presence of PCI Parity errors must be examined with a grain of salt.
+There are several add-in adapters that do **not** follow the PCI specification
+with regards to Parity generation and reporting. The specification says
+the vendor should tie the parity status bits to 0 if they do not intend
+to generate parity. Some vendors do not do this, and thus the parity bit
+can "float" giving false positives.
+
+There is a PCI device attribute located in sysfs that is checked by
+the EDAC PCI scanning code. If that attribute is set, PCI parity/error
+scanning is skipped for that device. The attribute is::
+
+ broken_parity_status
+
+and is located in ``/sys/devices/pci<XXX>/0000:XX:YY.Z`` directories for
+PCI devices.
+
+
+Versioning
+----------
+
+EDAC is composed of a "core" module (``edac_core.ko``) and several Memory
+Controller (MC) driver modules. On a given system, the CORE is loaded
+and one MC driver will be loaded. Both the CORE and the MC driver (or
+``edac_device`` driver) have individual versions that reflect current
+release level of their respective modules.
+
+Thus, to "report" on what version a system is running, one must report
+both the CORE's and the MC driver's versions.
+
+
+Loading
+-------
+
+If ``edac`` was statically linked with the kernel then no loading
+is necessary. If ``edac`` was built as modules then simply modprobe
+the ``edac`` pieces that you need. You should be able to modprobe
+hardware-specific modules and have the dependencies load the necessary
+core modules.
+
+Example::
+
+ $ modprobe amd76x_edac
+
+loads both the ``amd76x_edac.ko`` memory controller module and the
+``edac_mc.ko`` core module.
+
+
+Sysfs interface
+---------------
+
+EDAC presents a ``sysfs`` interface for control and reporting purposes. It
+lives in the /sys/devices/system/edac directory.
+
+Within this directory there currently reside 2 components:
+
+ ======= ==============================
+ mc memory controller(s) system
+ pci PCI control and status system
+ ======= ==============================
+
+
+
+Memory Controller (mc) Model
+----------------------------
+
+Each ``mc`` device controls a set of memory modules [#f4]_. These modules
+are laid out in a Chip-Select Row (``csrowX``) and Channel table (``chX``).
+There can be multiple csrows and multiple channels.
+
+.. [#f4] Nowadays, the term DIMM (Dual In-line Memory Module) is widely
+ used to refer to a memory module, although there are other memory
+ packaging alternatives, like SO-DIMM, SIMM, etc. The UEFI
+ specification (Version 2.7) defines a memory module in the Common
+ Platform Error Record (CPER) section to be an SMBIOS Memory Device
+ (Type 17). Along this document, and inside the EDAC subsystem, the term
+ "dimm" is used for all memory modules, even when they use a
+ different kind of packaging.
+
+Memory controllers allow for several csrows, with 8 csrows being a
+typical value. Yet, the actual number of csrows depends on the layout of
+a given motherboard, memory controller and memory module characteristics.
+
+Dual channels allow for dual data length (e. g. 128 bits, on 64 bit systems)
+data transfers to/from the CPU from/to memory. Some newer chipsets allow
+for more than 2 channels, like Fully Buffered DIMMs (FB-DIMMs) memory
+controllers. The following example will assume 2 channels:
+
+ +------------+-----------------------+
+ | CS Rows | Channels |
+ +------------+-----------+-----------+
+ | | ``ch0`` | ``ch1`` |
+ +============+===========+===========+
+ | |**DIMM_A0**|**DIMM_B0**|
+ +------------+-----------+-----------+
+ | ``csrow0`` | rank0 | rank0 |
+ +------------+-----------+-----------+
+ | ``csrow1`` | rank1 | rank1 |
+ +------------+-----------+-----------+
+ | |**DIMM_A1**|**DIMM_B1**|
+ +------------+-----------+-----------+
+ | ``csrow2`` | rank0 | rank0 |
+ +------------+-----------+-----------+
+ | ``csrow3`` | rank1 | rank1 |
+ +------------+-----------+-----------+
+
+In the above example, there are 4 physical slots on the motherboard
+for memory DIMMs:
+
+ +---------+---------+
+ | DIMM_A0 | DIMM_B0 |
+ +---------+---------+
+ | DIMM_A1 | DIMM_B1 |
+ +---------+---------+
+
+Labels for these slots are usually silk-screened on the motherboard.
+Slots labeled ``A`` are channel 0 in this example. Slots labeled ``B`` are
+channel 1. Notice that there are two csrows possible on a physical DIMM.
+These csrows are allocated their csrow assignment based on the slot into
+which the memory DIMM is placed. Thus, when 1 DIMM is placed in each
+Channel, the csrows cross both DIMMs.
+
+Memory DIMMs come single or dual "ranked". A rank is a populated csrow.
+In the example above 2 dual ranked DIMMs are similarly placed. Thus,
+both csrow0 and csrow1 are populated. On the other hand, when 2 single
+ranked DIMMs are placed in slots DIMM_A0 and DIMM_B0, then they will
+have just one csrow (csrow0) and csrow1 will be empty. The pattern
+repeats itself for csrow2 and csrow3. Also note that some memory
+controllers don't have any logic to identify the memory module, see
+``rankX`` directories below.
+
+The representation of the above is reflected in the directory
+tree in EDAC's sysfs interface. Starting in directory
+``/sys/devices/system/edac/mc``, each memory controller will be
+represented by its own ``mcX`` directory, where ``X`` is the
+index of the MC::
+
+ ..../edac/mc/
+ |
+ |->mc0
+ |->mc1
+ |->mc2
+ ....
+
+Under each ``mcX`` directory each ``csrowX`` is again represented by a
+``csrowX``, where ``X`` is the csrow index::
+
+ .../mc/mc0/
+ |
+ |->csrow0
+ |->csrow2
+ |->csrow3
+ ....
+
+Notice that there is no csrow1, which indicates that csrow0 is composed
+of a single ranked DIMMs. This should also apply in both Channels, in
+order to have dual-channel mode be operational. Since both csrow2 and
+csrow3 are populated, this indicates a dual ranked set of DIMMs for
+channels 0 and 1.
+
+Within each of the ``mcX`` and ``csrowX`` directories are several EDAC
+control and attribute files.
+
+``mcX`` directories
+-------------------
+
+In ``mcX`` directories are EDAC control and attribute files for
+this ``X`` instance of the memory controllers.
+
+For a description of the sysfs API, please see:
+
+ Documentation/ABI/testing/sysfs-devices-edac
+
+
+``dimmX`` or ``rankX`` directories
+----------------------------------
+
+The recommended way to use the EDAC subsystem is to look at the information
+provided by the ``dimmX`` or ``rankX`` directories [#f5]_.
+
+A typical EDAC system has the following structure under
+``/sys/devices/system/edac/``\ [#f6]_::
+
+ /sys/devices/system/edac/
+ ├── mc
+ │ ├── mc0
+ │ │ ├── ce_count
+ │ │ ├── ce_noinfo_count
+ │ │ ├── dimm0
+ │ │ │ ├── dimm_ce_count
+ │ │ │ ├── dimm_dev_type
+ │ │ │ ├── dimm_edac_mode
+ │ │ │ ├── dimm_label
+ │ │ │ ├── dimm_location
+ │ │ │ ├── dimm_mem_type
+ │ │ │ ├── dimm_ue_count
+ │ │ │ ├── size
+ │ │ │ └── uevent
+ │ │ ├── max_location
+ │ │ ├── mc_name
+ │ │ ├── reset_counters
+ │ │ ├── seconds_since_reset
+ │ │ ├── size_mb
+ │ │ ├── ue_count
+ │ │ ├── ue_noinfo_count
+ │ │ └── uevent
+ │ ├── mc1
+ │ │ ├── ce_count
+ │ │ ├── ce_noinfo_count
+ │ │ ├── dimm0
+ │ │ │ ├── dimm_ce_count
+ │ │ │ ├── dimm_dev_type
+ │ │ │ ├── dimm_edac_mode
+ │ │ │ ├── dimm_label
+ │ │ │ ├── dimm_location
+ │ │ │ ├── dimm_mem_type
+ │ │ │ ├── dimm_ue_count
+ │ │ │ ├── size
+ │ │ │ └── uevent
+ │ │ ├── max_location
+ │ │ ├── mc_name
+ │ │ ├── reset_counters
+ │ │ ├── seconds_since_reset
+ │ │ ├── size_mb
+ │ │ ├── ue_count
+ │ │ ├── ue_noinfo_count
+ │ │ └── uevent
+ │ └── uevent
+ └── uevent
+
+In the ``dimmX`` directories are EDAC control and attribute files for
+this ``X`` memory module:
+
+- ``size`` - Total memory managed by this csrow attribute file
+
+ This attribute file displays, in count of megabytes, the memory
+ that this csrow contains.
+
+- ``dimm_ue_count`` - Uncorrectable Errors count attribute file
+
+ This attribute file displays the total count of uncorrectable
+ errors that have occurred on this DIMM. If panic_on_ue is set
+ this counter will not have a chance to increment, since EDAC
+ will panic the system.
+
+- ``dimm_ce_count`` - Correctable Errors count attribute file
+
+ This attribute file displays the total count of correctable
+ errors that have occurred on this DIMM. This count is very
+ important to examine. CEs provide early indications that a
+ DIMM is beginning to fail. This count field should be
+ monitored for non-zero values and report such information
+ to the system administrator.
+
+- ``dimm_dev_type`` - Device type attribute file
+
+ This attribute file will display what type of DRAM device is
+ being utilized on this DIMM.
+ Examples:
+
+ - x1
+ - x2
+ - x4
+ - x8
+
+- ``dimm_edac_mode`` - EDAC Mode of operation attribute file
+
+ This attribute file will display what type of Error detection
+ and correction is being utilized.
+
+- ``dimm_label`` - memory module label control file
+
+ This control file allows this DIMM to have a label assigned
+ to it. With this label in the module, when errors occur
+ the output can provide the DIMM label in the system log.
+ This becomes vital for panic events to isolate the
+ cause of the UE event.
+
+ DIMM Labels must be assigned after booting, with information
+ that correctly identifies the physical slot with its
+ silk screen label. This information is currently very
+ motherboard specific and determination of this information
+ must occur in userland at this time.
+
+- ``dimm_location`` - location of the memory module
+
+ The location can have up to 3 levels, and describe how the
+ memory controller identifies the location of a memory module.
+ Depending on the type of memory and memory controller, it
+ can be:
+
+ - *csrow* and *channel* - used when the memory controller
+ doesn't identify a single DIMM - e. g. in ``rankX`` dir;
+ - *branch*, *channel*, *slot* - typically used on FB-DIMM memory
+ controllers;
+ - *channel*, *slot* - used on Nehalem and newer Intel drivers.
+
+- ``dimm_mem_type`` - Memory Type attribute file
+
+ This attribute file will display what type of memory is currently
+ on this csrow. Normally, either buffered or unbuffered memory.
+ Examples:
+
+ - Registered-DDR
+ - Unbuffered-DDR
+
+.. [#f5] On some systems, the memory controller doesn't have any logic
+ to identify the memory module. On such systems, the directory is called ``rankX`` and works on a similar way as the ``csrowX`` directories.
+ On modern Intel memory controllers, the memory controller identifies the
+ memory modules directly. On such systems, the directory is called ``dimmX``.
+
+.. [#f6] There are also some ``power`` directories and ``subsystem``
+ symlinks inside the sysfs mapping that are automatically created by
+ the sysfs subsystem. Currently, they serve no purpose.
+
+``csrowX`` directories
+----------------------
+
+When CONFIG_EDAC_LEGACY_SYSFS is enabled, sysfs will contain the ``csrowX``
+directories. As this API doesn't work properly for Rambus, FB-DIMMs and
+modern Intel Memory Controllers, this is being deprecated in favor of
+``dimmX`` directories.
+
+In the ``csrowX`` directories are EDAC control and attribute files for
+this ``X`` instance of csrow:
+
+
+- ``ue_count`` - Total Uncorrectable Errors count attribute file
+
+ This attribute file displays the total count of uncorrectable
+ errors that have occurred on this csrow. If panic_on_ue is set
+ this counter will not have a chance to increment, since EDAC
+ will panic the system.
+
+
+- ``ce_count`` - Total Correctable Errors count attribute file
+
+ This attribute file displays the total count of correctable
+ errors that have occurred on this csrow. This count is very
+ important to examine. CEs provide early indications that a
+ DIMM is beginning to fail. This count field should be
+ monitored for non-zero values and report such information
+ to the system administrator.
+
+
+- ``size_mb`` - Total memory managed by this csrow attribute file
+
+ This attribute file displays, in count of megabytes, the memory
+ that this csrow contains.
+
+
+- ``mem_type`` - Memory Type attribute file
+
+ This attribute file will display what type of memory is currently
+ on this csrow. Normally, either buffered or unbuffered memory.
+ Examples:
+
+ - Registered-DDR
+ - Unbuffered-DDR
+
+
+- ``edac_mode`` - EDAC Mode of operation attribute file
+
+ This attribute file will display what type of Error detection
+ and correction is being utilized.
+
+
+- ``dev_type`` - Device type attribute file
+
+ This attribute file will display what type of DRAM device is
+ being utilized on this DIMM.
+ Examples:
+
+ - x1
+ - x2
+ - x4
+ - x8
+
+
+- ``ch0_ce_count`` - Channel 0 CE Count attribute file
+
+ This attribute file will display the count of CEs on this
+ DIMM located in channel 0.
+
+
+- ``ch0_ue_count`` - Channel 0 UE Count attribute file
+
+ This attribute file will display the count of UEs on this
+ DIMM located in channel 0.
+
+
+- ``ch0_dimm_label`` - Channel 0 DIMM Label control file
+
+
+ This control file allows this DIMM to have a label assigned
+ to it. With this label in the module, when errors occur
+ the output can provide the DIMM label in the system log.
+ This becomes vital for panic events to isolate the
+ cause of the UE event.
+
+ DIMM Labels must be assigned after booting, with information
+ that correctly identifies the physical slot with its
+ silk screen label. This information is currently very
+ motherboard specific and determination of this information
+ must occur in userland at this time.
+
+
+- ``ch1_ce_count`` - Channel 1 CE Count attribute file
+
+
+ This attribute file will display the count of CEs on this
+ DIMM located in channel 1.
+
+
+- ``ch1_ue_count`` - Channel 1 UE Count attribute file
+
+
+ This attribute file will display the count of UEs on this
+ DIMM located in channel 0.
+
+
+- ``ch1_dimm_label`` - Channel 1 DIMM Label control file
+
+ This control file allows this DIMM to have a label assigned
+ to it. With this label in the module, when errors occur
+ the output can provide the DIMM label in the system log.
+ This becomes vital for panic events to isolate the
+ cause of the UE event.
+
+ DIMM Labels must be assigned after booting, with information
+ that correctly identifies the physical slot with its
+ silk screen label. This information is currently very
+ motherboard specific and determination of this information
+ must occur in userland at this time.
+
+
+System Logging
+--------------
+
+If logging for UEs and CEs is enabled, then system logs will contain
+information indicating that errors have been detected::
+
+ EDAC MC0: CE page 0x283, offset 0xce0, grain 8, syndrome 0x6ec3, row 0, channel 1 "DIMM_B1": amd76x_edac
+ EDAC MC0: CE page 0x1e5, offset 0xfb0, grain 8, syndrome 0xb741, row 0, channel 1 "DIMM_B1": amd76x_edac
+
+
+The structure of the message is:
+
+ +---------------------------------------+-------------+
+ | Content | Example |
+ +=======================================+=============+
+ | The memory controller | MC0 |
+ +---------------------------------------+-------------+
+ | Error type | CE |
+ +---------------------------------------+-------------+
+ | Memory page | 0x283 |
+ +---------------------------------------+-------------+
+ | Offset in the page | 0xce0 |
+ +---------------------------------------+-------------+
+ | The byte granularity | grain 8 |
+ | or resolution of the error | |
+ +---------------------------------------+-------------+
+ | The error syndrome | 0xb741 |
+ +---------------------------------------+-------------+
+ | Memory row | row 0 |
+ +---------------------------------------+-------------+
+ | Memory channel | channel 1 |
+ +---------------------------------------+-------------+
+ | DIMM label, if set prior | DIMM B1 |
+ +---------------------------------------+-------------+
+ | And then an optional, driver-specific | |
+ | message that may have additional | |
+ | information. | |
+ +---------------------------------------+-------------+
+
+Both UEs and CEs with no info will lack all but memory controller, error
+type, a notice of "no info" and then an optional, driver-specific error
+message.
+
+
+PCI Bus Parity Detection
+------------------------
+
+On Header Type 00 devices, the primary status is looked at for any
+parity error regardless of whether parity is enabled on the device or
+not. (The spec indicates parity is generated in some cases). On Header
+Type 01 bridges, the secondary status register is also looked at to see
+if parity occurred on the bus on the other side of the bridge.
+
+
+Sysfs configuration
+-------------------
+
+Under ``/sys/devices/system/edac/pci`` are control and attribute files as
+follows:
+
+
+- ``check_pci_parity`` - Enable/Disable PCI Parity checking control file
+
+ This control file enables or disables the PCI Bus Parity scanning
+ operation. Writing a 1 to this file enables the scanning. Writing
+ a 0 to this file disables the scanning.
+
+ Enable::
+
+ echo "1" >/sys/devices/system/edac/pci/check_pci_parity
+
+ Disable::
+
+ echo "0" >/sys/devices/system/edac/pci/check_pci_parity
+
+
+- ``pci_parity_count`` - Parity Count
+
+ This attribute file will display the number of parity errors that
+ have been detected.
+
+
+Module parameters
+-----------------
+
+- ``edac_mc_panic_on_ue`` - Panic on UE control file
+
+ An uncorrectable error will cause a machine panic. This is usually
+ desirable. It is a bad idea to continue when an uncorrectable error
+ occurs - it is indeterminate what was uncorrected and the operating
+ system context might be so mangled that continuing will lead to further
+ corruption. If the kernel has MCE configured, then EDAC will never
+ notice the UE.
+
+ LOAD TIME::
+
+ module/kernel parameter: edac_mc_panic_on_ue=[0|1]
+
+ RUN TIME::
+
+ echo "1" > /sys/module/edac_core/parameters/edac_mc_panic_on_ue
+
+
+- ``edac_mc_log_ue`` - Log UE control file
+
+
+ Generate kernel messages describing uncorrectable errors. These errors
+ are reported through the system message log system. UE statistics
+ will be accumulated even when UE logging is disabled.
+
+ LOAD TIME::
+
+ module/kernel parameter: edac_mc_log_ue=[0|1]
+
+ RUN TIME::
+
+ echo "1" > /sys/module/edac_core/parameters/edac_mc_log_ue
+
+
+- ``edac_mc_log_ce`` - Log CE control file
+
+
+ Generate kernel messages describing correctable errors. These
+ errors are reported through the system message log system.
+ CE statistics will be accumulated even when CE logging is disabled.
+
+ LOAD TIME::
+
+ module/kernel parameter: edac_mc_log_ce=[0|1]
+
+ RUN TIME::
+
+ echo "1" > /sys/module/edac_core/parameters/edac_mc_log_ce
+
+
+- ``edac_mc_poll_msec`` - Polling period control file
+
+
+ The time period, in milliseconds, for polling for error information.
+ Too small a value wastes resources. Too large a value might delay
+ necessary handling of errors and might loose valuable information for
+ locating the error. 1000 milliseconds (once each second) is the current
+ default. Systems which require all the bandwidth they can get, may
+ increase this.
+
+ LOAD TIME::
+
+ module/kernel parameter: edac_mc_poll_msec=[0|1]
+
+ RUN TIME::
+
+ echo "1000" > /sys/module/edac_core/parameters/edac_mc_poll_msec
+
+
+- ``panic_on_pci_parity`` - Panic on PCI PARITY Error
+
+
+ This control file enables or disables panicking when a parity
+ error has been detected.
+
+
+ module/kernel parameter::
+
+ edac_panic_on_pci_pe=[0|1]
+
+ Enable::
+
+ echo "1" > /sys/module/edac_core/parameters/edac_panic_on_pci_pe
+
+ Disable::
+
+ echo "0" > /sys/module/edac_core/parameters/edac_panic_on_pci_pe
+
+
+
+EDAC device type
+----------------
+
+In the header file, edac_pci.h, there is a series of edac_device structures
+and APIs for the EDAC_DEVICE.
+
+User space access to an edac_device is through the sysfs interface.
+
+At the location ``/sys/devices/system/edac`` (sysfs) new edac_device devices
+will appear.
+
+There is a three level tree beneath the above ``edac`` directory. For example,
+the ``test_device_edac`` device (found at the http://bluesmoke.sourceforget.net
+website) installs itself as::
+
+ /sys/devices/system/edac/test-instance
+
+in this directory are various controls, a symlink and one or more ``instance``
+directories.
+
+The standard default controls are:
+
+ ============== =======================================================
+ log_ce boolean to log CE events
+ log_ue boolean to log UE events
+ panic_on_ue boolean to ``panic`` the system if an UE is encountered
+ (default off, can be set true via startup script)
+ poll_msec time period between POLL cycles for events
+ ============== =======================================================
+
+The test_device_edac device adds at least one of its own custom control:
+
+ ============== ==================================================
+ test_bits which in the current test driver does nothing but
+ show how it is installed. A ported driver can
+ add one or more such controls and/or attributes
+ for specific uses.
+ One out-of-tree driver uses controls here to allow
+ for ERROR INJECTION operations to hardware
+ injection registers
+ ============== ==================================================
+
+The symlink points to the 'struct dev' that is registered for this edac_device.
+
+Instances
+---------
+
+One or more instance directories are present. For the ``test_device_edac``
+case:
+
+ +----------------+
+ | test-instance0 |
+ +----------------+
+
+
+In this directory there are two default counter attributes, which are totals of
+counter in deeper subdirectories.
+
+ ============== ====================================
+ ce_count total of CE events of subdirectories
+ ue_count total of UE events of subdirectories
+ ============== ====================================
+
+Blocks
+------
+
+At the lowest directory level is the ``block`` directory. There can be 0, 1
+or more blocks specified in each instance:
+
+ +-------------+
+ | test-block0 |
+ +-------------+
+
+In this directory the default attributes are:
+
+ ============== ================================================
+ ce_count which is counter of CE events for this ``block``
+ of hardware being monitored
+ ue_count which is counter of UE events for this ``block``
+ of hardware being monitored
+ ============== ================================================
+
+
+The ``test_device_edac`` device adds 4 attributes and 1 control:
+
+ ================== ====================================================
+ test-block-bits-0 for every POLL cycle this counter
+ is incremented
+ test-block-bits-1 every 10 cycles, this counter is bumped once,
+ and test-block-bits-0 is set to 0
+ test-block-bits-2 every 100 cycles, this counter is bumped once,
+ and test-block-bits-1 is set to 0
+ test-block-bits-3 every 1000 cycles, this counter is bumped once,
+ and test-block-bits-2 is set to 0
+ ================== ====================================================
+
+
+ ================== ====================================================
+ reset-counters writing ANY thing to this control will
+ reset all the above counters.
+ ================== ====================================================
+
+
+Use of the ``test_device_edac`` driver should enable any others to create their own
+unique drivers for their hardware systems.
+
+The ``test_device_edac`` sample driver is located at the
+http://bluesmoke.sourceforge.net project site for EDAC.
+
+
+Usage of EDAC APIs on Nehalem and newer Intel CPUs
+--------------------------------------------------
+
+On older Intel architectures, the memory controller was part of the North
+Bridge chipset. Nehalem, Sandy Bridge, Ivy Bridge, Haswell, Sky Lake and
+newer Intel architectures integrated an enhanced version of the memory
+controller (MC) inside the CPUs.
+
+This chapter will cover the differences of the enhanced memory controllers
+found on newer Intel CPUs, such as ``i7core_edac``, ``sb_edac`` and
+``sbx_edac`` drivers.
+
+.. note::
+
+ The Xeon E7 processor families use a separate chip for the memory
+ controller, called Intel Scalable Memory Buffer. This section doesn't
+ apply for such families.
+
+1) There is one Memory Controller per Quick Patch Interconnect
+ (QPI). At the driver, the term "socket" means one QPI. This is
+ associated with a physical CPU socket.
+
+ Each MC have 3 physical read channels, 3 physical write channels and
+ 3 logic channels. The driver currently sees it as just 3 channels.
+ Each channel can have up to 3 DIMMs.
+
+ The minimum known unity is DIMMs. There are no information about csrows.
+ As EDAC API maps the minimum unity is csrows, the driver sequentially
+ maps channel/DIMM into different csrows.
+
+ For example, supposing the following layout::
+
+ Ch0 phy rd0, wr0 (0x063f4031): 2 ranks, UDIMMs
+ dimm 0 1024 Mb offset: 0, bank: 8, rank: 1, row: 0x4000, col: 0x400
+ dimm 1 1024 Mb offset: 4, bank: 8, rank: 1, row: 0x4000, col: 0x400
+ Ch1 phy rd1, wr1 (0x063f4031): 2 ranks, UDIMMs
+ dimm 0 1024 Mb offset: 0, bank: 8, rank: 1, row: 0x4000, col: 0x400
+ Ch2 phy rd3, wr3 (0x063f4031): 2 ranks, UDIMMs
+ dimm 0 1024 Mb offset: 0, bank: 8, rank: 1, row: 0x4000, col: 0x400
+
+ The driver will map it as::
+
+ csrow0: channel 0, dimm0
+ csrow1: channel 0, dimm1
+ csrow2: channel 1, dimm0
+ csrow3: channel 2, dimm0
+
+ exports one DIMM per csrow.
+
+ Each QPI is exported as a different memory controller.
+
+2) The MC has the ability to inject errors to test drivers. The drivers
+ implement this functionality via some error injection nodes:
+
+ For injecting a memory error, there are some sysfs nodes, under
+ ``/sys/devices/system/edac/mc/mc?/``:
+
+ - ``inject_addrmatch/*``:
+ Controls the error injection mask register. It is possible to specify
+ several characteristics of the address to match an error code::
+
+ dimm = the affected dimm. Numbers are relative to a channel;
+ rank = the memory rank;
+ channel = the channel that will generate an error;
+ bank = the affected bank;
+ page = the page address;
+ column (or col) = the address column.
+
+ each of the above values can be set to "any" to match any valid value.
+
+ At driver init, all values are set to any.
+
+ For example, to generate an error at rank 1 of dimm 2, for any channel,
+ any bank, any page, any column::
+
+ echo 2 >/sys/devices/system/edac/mc/mc0/inject_addrmatch/dimm
+ echo 1 >/sys/devices/system/edac/mc/mc0/inject_addrmatch/rank
+
+ To return to the default behaviour of matching any, you can do::
+
+ echo any >/sys/devices/system/edac/mc/mc0/inject_addrmatch/dimm
+ echo any >/sys/devices/system/edac/mc/mc0/inject_addrmatch/rank
+
+ - ``inject_eccmask``:
+ specifies what bits will have troubles,
+
+ - ``inject_section``:
+ specifies what ECC cache section will get the error::
+
+ 3 for both
+ 2 for the highest
+ 1 for the lowest
+
+ - ``inject_type``:
+ specifies the type of error, being a combination of the following bits::
+
+ bit 0 - repeat
+ bit 1 - ecc
+ bit 2 - parity
+
+ - ``inject_enable``:
+ starts the error generation when something different than 0 is written.
+
+ All inject vars can be read. root permission is needed for write.
+
+ Datasheet states that the error will only be generated after a write on an
+ address that matches inject_addrmatch. It seems, however, that reading will
+ also produce an error.
+
+ For example, the following code will generate an error for any write access
+ at socket 0, on any DIMM/address on channel 2::
+
+ echo 2 >/sys/devices/system/edac/mc/mc0/inject_addrmatch/channel
+ echo 2 >/sys/devices/system/edac/mc/mc0/inject_type
+ echo 64 >/sys/devices/system/edac/mc/mc0/inject_eccmask
+ echo 3 >/sys/devices/system/edac/mc/mc0/inject_section
+ echo 1 >/sys/devices/system/edac/mc/mc0/inject_enable
+ dd if=/dev/mem of=/dev/null seek=16k bs=4k count=1 >& /dev/null
+
+ For socket 1, it is needed to replace "mc0" by "mc1" at the above
+ commands.
+
+ The generated error message will look like::
+
+ EDAC MC0: UE row 0, channel-a= 0 channel-b= 0 labels "-": NON_FATAL (addr = 0x0075b980, socket=0, Dimm=0, Channel=2, syndrome=0x00000040, count=1, Err=8c0000400001009f:4000080482 (read error: read ECC error))
+
+3) Corrected Error memory register counters
+
+ Those newer MCs have some registers to count memory errors. The driver
+ uses those registers to report Corrected Errors on devices with Registered
+ DIMMs.
+
+ However, those counters don't work with Unregistered DIMM. As the chipset
+ offers some counters that also work with UDIMMs (but with a worse level of
+ granularity than the default ones), the driver exposes those registers for
+ UDIMM memories.
+
+ They can be read by looking at the contents of ``all_channel_counts/``::
+
+ $ for i in /sys/devices/system/edac/mc/mc0/all_channel_counts/*; do echo $i; cat $i; done
+ /sys/devices/system/edac/mc/mc0/all_channel_counts/udimm0
+ 0
+ /sys/devices/system/edac/mc/mc0/all_channel_counts/udimm1
+ 0
+ /sys/devices/system/edac/mc/mc0/all_channel_counts/udimm2
+ 0
+
+ What happens here is that errors on different csrows, but at the same
+ dimm number will increment the same counter.
+ So, in this memory mapping::
+
+ csrow0: channel 0, dimm0
+ csrow1: channel 0, dimm1
+ csrow2: channel 1, dimm0
+ csrow3: channel 2, dimm0
+
+ The hardware will increment udimm0 for an error at the first dimm at either
+ csrow0, csrow2 or csrow3;
+
+ The hardware will increment udimm1 for an error at the second dimm at either
+ csrow0, csrow2 or csrow3;
+
+ The hardware will increment udimm2 for an error at the third dimm at either
+ csrow0, csrow2 or csrow3;
+
+4) Standard error counters
+
+ The standard error counters are generated when an mcelog error is received
+ by the driver. Since, with UDIMM, this is counted by software, it is
+ possible that some errors could be lost. With RDIMM's, they display the
+ contents of the registers
+
+Reference documents used on ``amd64_edac``
+------------------------------------------
+
+``amd64_edac`` module is based on the following documents
+(available from http://support.amd.com/en-us/search/tech-docs):
+
+1. :Title: BIOS and Kernel Developer's Guide for AMD Athlon 64 and AMD
+ Opteron Processors
+ :AMD publication #: 26094
+ :Revision: 3.26
+ :Link: http://support.amd.com/TechDocs/26094.PDF
+
+2. :Title: BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh
+ Processors
+ :AMD publication #: 32559
+ :Revision: 3.00
+ :Issue Date: May 2006
+ :Link: http://support.amd.com/TechDocs/32559.pdf
+
+3. :Title: BIOS and Kernel Developer's Guide (BKDG) For AMD Family 10h
+ Processors
+ :AMD publication #: 31116
+ :Revision: 3.00
+ :Issue Date: September 07, 2007
+ :Link: http://support.amd.com/TechDocs/31116.pdf
+
+4. :Title: BIOS and Kernel Developer's Guide (BKDG) for AMD Family 15h
+ Models 30h-3Fh Processors
+ :AMD publication #: 49125
+ :Revision: 3.06
+ :Issue Date: 2/12/2015 (latest release)
+ :Link: http://support.amd.com/TechDocs/49125_15h_Models_30h-3Fh_BKDG.pdf
+
+5. :Title: BIOS and Kernel Developer's Guide (BKDG) for AMD Family 15h
+ Models 60h-6Fh Processors
+ :AMD publication #: 50742
+ :Revision: 3.01
+ :Issue Date: 7/23/2015 (latest release)
+ :Link: http://support.amd.com/TechDocs/50742_15h_Models_60h-6Fh_BKDG.pdf
+
+6. :Title: BIOS and Kernel Developer's Guide (BKDG) for AMD Family 16h
+ Models 00h-0Fh Processors
+ :AMD publication #: 48751
+ :Revision: 3.03
+ :Issue Date: 2/23/2015 (latest release)
+ :Link: http://support.amd.com/TechDocs/48751_16h_bkdg.pdf
+
+Credits
+=======
+
+* Written by Doug Thompson <dougthompson@xmission.com>
+
+ - 7 Dec 2005
+ - 17 Jul 2007 Updated
+
+* |copy| Mauro Carvalho Chehab
+
+ - 05 Aug 2009 Nehalem interface
+ - 26 Oct 2016 Converted to ReST and cleanups at the Nehalem section
+
+* EDAC authors/maintainers:
+
+ - Doug Thompson, Dave Jiang, Dave Peterson et al,
+ - Mauro Carvalho Chehab
+ - Borislav Petkov
+ - original author: Thayne Harbaugh
diff --git a/Documentation/admin-guide/README.rst b/Documentation/admin-guide/README.rst
index 9a969c0..f2bebff 100644
--- a/Documentation/admin-guide/README.rst
+++ b/Documentation/admin-guide/README.rst
@@ -262,9 +262,11 @@
- Make sure you have at least gcc 5.1 available.
For more information, refer to :ref:`Documentation/process/changes.rst <changes>`.
- - Do a ``make`` to create a compressed kernel image. It is also
- possible to do ``make install`` if you have lilo installed to suit the
- kernel makefiles, but you may want to check your particular lilo setup first.
+ - Do a ``make`` to create a compressed kernel image. It is also possible to do
+ ``make install`` if you have lilo installed or if your distribution has an
+ install script recognised by the kernel's installer. Most popular
+ distributions will have a recognized install script. You may want to
+ check your distribution's setup first.
To do the actual install, you have to be root, but none of the normal
build should require that. Don't take the name of root in vain.
@@ -301,32 +303,51 @@
image (e.g. .../linux/arch/x86/boot/bzImage after compilation)
to the place where your regular bootable kernel is found.
- - Booting a kernel directly from a floppy without the assistance of a
- bootloader such as LILO, is no longer supported.
+ - Booting a kernel directly from a storage device without the assistance
+ of a bootloader such as LILO or GRUB, is no longer supported in BIOS
+ (non-EFI systems). On UEFI/EFI systems, however, you can use EFISTUB
+ which allows the motherboard to boot directly to the kernel.
+ On modern workstations and desktops, it's generally recommended to use a
+ bootloader as difficulties can arise with multiple kernels and secure boot.
+ For more details on EFISTUB,
+ see "Documentation/admin-guide/efi-stub.rst".
- If you boot Linux from the hard drive, chances are you use LILO, which
- uses the kernel image as specified in the file /etc/lilo.conf. The
- kernel image file is usually /vmlinuz, /boot/vmlinuz, /bzImage or
- /boot/bzImage. To use the new kernel, save a copy of the old image
- and copy the new image over the old one. Then, you MUST RERUN LILO
- to update the loading map! If you don't, you won't be able to boot
- the new kernel image.
+ - It's important to note that as of 2016 LILO (LInux LOader) is no longer in
+ active development, though as it was extremely popular, it often comes up
+ in documentation. Popular alternatives include GRUB2, rEFInd, Syslinux,
+ systemd-boot, or EFISTUB. For various reasons, it's not recommended to use
+ software that's no longer in active development.
- Reinstalling LILO is usually a matter of running /sbin/lilo.
- You may wish to edit /etc/lilo.conf to specify an entry for your
- old kernel image (say, /vmlinux.old) in case the new one does not
- work. See the LILO docs for more information.
+ - Chances are your distribution includes an install script and running
+ ``make install`` will be all that's needed. Should that not be the case
+ you'll have to identify your bootloader and reference its documentation or
+ configure your EFI.
- After reinstalling LILO, you should be all set. Shutdown the system,
+Legacy LILO Instructions
+------------------------
+
+
+ - If you use LILO the kernel images are specified in the file /etc/lilo.conf.
+ The kernel image file is usually /vmlinuz, /boot/vmlinuz, /bzImage or
+ /boot/bzImage. To use the new kernel, save a copy of the old image and copy
+ the new image over the old one. Then, you MUST RERUN LILO to update the
+ loading map! If you don't, you won't be able to boot the new kernel image.
+
+ - Reinstalling LILO is usually a matter of running /sbin/lilo. You may wish
+ to edit /etc/lilo.conf to specify an entry for your old kernel image
+ (say, /vmlinux.old) in case the new one does not work. See the LILO docs
+ for more information.
+
+ - After reinstalling LILO, you should be all set. Shutdown the system,
reboot, and enjoy!
- If you ever need to change the default root device, video mode,
- etc. in the kernel image, use your bootloader's boot options
- where appropriate. No need to recompile the kernel to change
- these parameters.
+ - If you ever need to change the default root device, video mode, etc. in the
+ kernel image, use your bootloader's boot options where appropriate. No need
+ to recompile the kernel to change these parameters.
- Reboot with the new kernel and enjoy.
+
If something goes wrong
-----------------------
diff --git a/Documentation/admin-guide/cgroup-v1/cpusets.rst b/Documentation/admin-guide/cgroup-v1/cpusets.rst
index ae646d6..7d3415e 100644
--- a/Documentation/admin-guide/cgroup-v1/cpusets.rst
+++ b/Documentation/admin-guide/cgroup-v1/cpusets.rst
@@ -179,7 +179,7 @@
- cpuset.mem_hardwall flag: is memory allocation hardwalled
- cpuset.memory_pressure: measure of how much paging pressure in cpuset
- cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes
- - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes
+ - cpuset.memory_spread_slab flag: OBSOLETE. Doesn't have any function.
- cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset
- cpuset.sched_relax_domain_level: the searching range when migrating tasks
diff --git a/Documentation/admin-guide/cgroup-v1/hugetlb.rst b/Documentation/admin-guide/cgroup-v1/hugetlb.rst
index 0fa724d..493a8e3 100644
--- a/Documentation/admin-guide/cgroup-v1/hugetlb.rst
+++ b/Documentation/admin-guide/cgroup-v1/hugetlb.rst
@@ -65,10 +65,12 @@
1. Page fault accounting
-hugetlb.<hugepagesize>.limit_in_bytes
-hugetlb.<hugepagesize>.max_usage_in_bytes
-hugetlb.<hugepagesize>.usage_in_bytes
-hugetlb.<hugepagesize>.failcnt
+::
+
+ hugetlb.<hugepagesize>.limit_in_bytes
+ hugetlb.<hugepagesize>.max_usage_in_bytes
+ hugetlb.<hugepagesize>.usage_in_bytes
+ hugetlb.<hugepagesize>.failcnt
The HugeTLB controller allows users to limit the HugeTLB usage (page fault) per
control group and enforces the limit during page fault. Since HugeTLB
@@ -82,10 +84,12 @@
2. Reservation accounting
-hugetlb.<hugepagesize>.rsvd.limit_in_bytes
-hugetlb.<hugepagesize>.rsvd.max_usage_in_bytes
-hugetlb.<hugepagesize>.rsvd.usage_in_bytes
-hugetlb.<hugepagesize>.rsvd.failcnt
+::
+
+ hugetlb.<hugepagesize>.rsvd.limit_in_bytes
+ hugetlb.<hugepagesize>.rsvd.max_usage_in_bytes
+ hugetlb.<hugepagesize>.rsvd.usage_in_bytes
+ hugetlb.<hugepagesize>.rsvd.failcnt
The HugeTLB controller allows to limit the HugeTLB reservations per control
group and enforces the controller limit at reservation time and at the fault of
diff --git a/Documentation/admin-guide/cifs/introduction.rst b/Documentation/admin-guide/cifs/introduction.rst
index 53ea629..ffc6e25 100644
--- a/Documentation/admin-guide/cifs/introduction.rst
+++ b/Documentation/admin-guide/cifs/introduction.rst
@@ -28,7 +28,7 @@
high performance safe distributed caching (leases/oplocks), optional packet
signing, large files, Unicode support and other internationalization
improvements. Since both Samba server and this filesystem client support the
- CIFS Unix extensions, and the Linux client also suppors SMB3 POSIX extensions,
+ CIFS Unix extensions, and the Linux client also supports SMB3 POSIX extensions,
the combination can provide a reasonable alternative to other network and
cluster file systems for fileserving in some Linux to Linux environments,
not just in Linux to Windows (or Linux to Mac) environments.
diff --git a/Documentation/admin-guide/device-mapper/index.rst b/Documentation/admin-guide/device-mapper/index.rst
index cde52cc..cc5aec8 100644
--- a/Documentation/admin-guide/device-mapper/index.rst
+++ b/Documentation/admin-guide/device-mapper/index.rst
@@ -34,6 +34,8 @@
switch
thin-provisioning
unstriped
+ vdo-design
+ vdo
verity
writecache
zero
diff --git a/Documentation/admin-guide/device-mapper/vdo-design.rst b/Documentation/admin-guide/device-mapper/vdo-design.rst
new file mode 100644
index 0000000..3cd59de
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/vdo-design.rst
@@ -0,0 +1,633 @@
+.. SPDX-License-Identifier: GPL-2.0-only
+
+================
+Design of dm-vdo
+================
+
+The dm-vdo (virtual data optimizer) target provides inline deduplication,
+compression, zero-block elimination, and thin provisioning. A dm-vdo target
+can be backed by up to 256TB of storage, and can present a logical size of
+up to 4PB. This target was originally developed at Permabit Technology
+Corp. starting in 2009. It was first released in 2013 and has been used in
+production environments ever since. It was made open-source in 2017 after
+Permabit was acquired by Red Hat. This document describes the design of
+dm-vdo. For usage, see vdo.rst in the same directory as this file.
+
+Because deduplication rates fall drastically as the block size increases, a
+vdo target has a maximum block size of 4K. However, it can achieve
+deduplication rates of 254:1, i.e. up to 254 copies of a given 4K block can
+reference a single 4K of actual storage. It can achieve compression rates
+of 14:1. All zero blocks consume no storage at all.
+
+Theory of Operation
+===================
+
+The design of dm-vdo is based on the idea that deduplication is a two-part
+problem. The first is to recognize duplicate data. The second is to avoid
+storing multiple copies of those duplicates. Therefore, dm-vdo has two main
+parts: a deduplication index (called UDS) that is used to discover
+duplicate data, and a data store with a reference counted block map that
+maps from logical block addresses to the actual storage location of the
+data.
+
+Zones and Threading
+-------------------
+
+Due to the complexity of data optimization, the number of metadata
+structures involved in a single write operation to a vdo target is larger
+than most other targets. Furthermore, because vdo must operate on small
+block sizes in order to achieve good deduplication rates, acceptable
+performance can only be achieved through parallelism. Therefore, vdo's
+design attempts to be lock-free.
+
+Most of a vdo's main data structures are designed to be easily divided into
+"zones" such that any given bio must only access a single zone of any zoned
+structure. Safety with minimal locking is achieved by ensuring that during
+normal operation, each zone is assigned to a specific thread, and only that
+thread will access the portion of the data structure in that zone.
+Associated with each thread is a work queue. Each bio is associated with a
+request object (the "data_vio") which will be added to a work queue when
+the next phase of its operation requires access to the structures in the
+zone associated with that queue.
+
+Another way of thinking about this arrangement is that the work queue for
+each zone has an implicit lock on the structures it manages for all its
+operations, because vdo guarantees that no other thread will alter those
+structures.
+
+Although each structure is divided into zones, this division is not
+reflected in the on-disk representation of each data structure. Therefore,
+the number of zones for each structure, and hence the number of threads,
+can be reconfigured each time a vdo target is started.
+
+The Deduplication Index
+-----------------------
+
+In order to identify duplicate data efficiently, vdo was designed to
+leverage some common characteristics of duplicate data. From empirical
+observations, we gathered two key insights. The first is that in most data
+sets with significant amounts of duplicate data, the duplicates tend to
+have temporal locality. When a duplicate appears, it is more likely that
+other duplicates will be detected, and that those duplicates will have been
+written at about the same time. This is why the index keeps records in
+temporal order. The second insight is that new data is more likely to
+duplicate recent data than it is to duplicate older data and in general,
+there are diminishing returns to looking further back in time. Therefore,
+when the index is full, it should cull its oldest records to make space for
+new ones. Another important idea behind the design of the index is that the
+ultimate goal of deduplication is to reduce storage costs. Since there is a
+trade-off between the storage saved and the resources expended to achieve
+those savings, vdo does not attempt to find every last duplicate block. It
+is sufficient to find and eliminate most of the redundancy.
+
+Each block of data is hashed to produce a 16-byte block name. An index
+record consists of this block name paired with the presumed location of
+that data on the underlying storage. However, it is not possible to
+guarantee that the index is accurate. In the most common case, this occurs
+because it is too costly to update the index when a block is over-written
+or discarded. Doing so would require either storing the block name along
+with the blocks, which is difficult to do efficiently in block-based
+storage, or reading and rehashing each block before overwriting it.
+Inaccuracy can also result from a hash collision where two different blocks
+have the same name. In practice, this is extremely unlikely, but because
+vdo does not use a cryptographic hash, a malicious workload could be
+constructed. Because of these inaccuracies, vdo treats the locations in the
+index as hints, and reads each indicated block to verify that it is indeed
+a duplicate before sharing the existing block with a new one.
+
+Records are collected into groups called chapters. New records are added to
+the newest chapter, called the open chapter. This chapter is stored in a
+format optimized for adding and modifying records, and the content of the
+open chapter is not finalized until it runs out of space for new records.
+When the open chapter fills up, it is closed and a new open chapter is
+created to collect new records.
+
+Closing a chapter converts it to a different format which is optimized for
+reading. The records are written to a series of record pages based on the
+order in which they were received. This means that records with temporal
+locality should be on a small number of pages, reducing the I/O required to
+retrieve them. The chapter also compiles an index that indicates which
+record page contains any given name. This index means that a request for a
+name can determine exactly which record page may contain that record,
+without having to load the entire chapter from storage. This index uses
+only a subset of the block name as its key, so it cannot guarantee that an
+index entry refers to the desired block name. It can only guarantee that if
+there is a record for this name, it will be on the indicated page. Closed
+chapters are read-only structures and their contents are never altered in
+any way.
+
+Once enough records have been written to fill up all the available index
+space, the oldest chapter is removed to make space for new chapters. Any
+time a request finds a matching record in the index, that record is copied
+into the open chapter. This ensures that useful block names remain available
+in the index, while unreferenced block names are forgotten over time.
+
+In order to find records in older chapters, the index also maintains a
+higher level structure called the volume index, which contains entries
+mapping each block name to the chapter containing its newest record. This
+mapping is updated as records for the block name are copied or updated,
+ensuring that only the newest record for a given block name can be found.
+An older record for a block name will no longer be found even though it has
+not been deleted from its chapter. Like the chapter index, the volume index
+uses only a subset of the block name as its key and can not definitively
+say that a record exists for a name. It can only say which chapter would
+contain the record if a record exists. The volume index is stored entirely
+in memory and is saved to storage only when the vdo target is shut down.
+
+From the viewpoint of a request for a particular block name, it will first
+look up the name in the volume index. This search will either indicate that
+the name is new, or which chapter to search. If it returns a chapter, the
+request looks up its name in the chapter index. This will indicate either
+that the name is new, or which record page to search. Finally, if it is not
+new, the request will look for its name in the indicated record page.
+This process may require up to two page reads per request (one for the
+chapter index page and one for the request page). However, recently
+accessed pages are cached so that these page reads can be amortized across
+many block name requests.
+
+The volume index and the chapter indexes are implemented using a
+memory-efficient structure called a delta index. Instead of storing the
+entire block name (the key) for each entry, the entries are sorted by name
+and only the difference between adjacent keys (the delta) is stored.
+Because we expect the hashes to be randomly distributed, the size of the
+deltas follows an exponential distribution. Because of this distribution,
+the deltas are expressed using a Huffman code to take up even less space.
+The entire sorted list of keys is called a delta list. This structure
+allows the index to use many fewer bytes per entry than a traditional hash
+table, but it is slightly more expensive to look up entries, because a
+request must read every entry in a delta list to add up the deltas in order
+to find the record it needs. The delta index reduces this lookup cost by
+splitting its key space into many sub-lists, each starting at a fixed key
+value, so that each individual list is short.
+
+The default index size can hold 64 million records, corresponding to about
+256GB of data. This means that the index can identify duplicate data if the
+original data was written within the last 256GB of writes. This range is
+called the deduplication window. If new writes duplicate data that is older
+than that, the index will not be able to find it because the records of the
+older data have been removed. This means that if an application writes a
+200 GB file to a vdo target and then immediately writes it again, the two
+copies will deduplicate perfectly. Doing the same with a 500 GB file will
+result in no deduplication, because the beginning of the file will no
+longer be in the index by the time the second write begins (assuming there
+is no duplication within the file itself).
+
+If an application anticipates a data workload that will see useful
+deduplication beyond the 256GB threshold, vdo can be configured to use a
+larger index with a correspondingly larger deduplication window. (This
+configuration can only be set when the target is created, not altered
+later. It is important to consider the expected workload for a vdo target
+before configuring it.) There are two ways to do this.
+
+One way is to increase the memory size of the index, which also increases
+the amount of backing storage required. Doubling the size of the index will
+double the length of the deduplication window at the expense of doubling
+the storage size and the memory requirements.
+
+The other option is to enable sparse indexing. Sparse indexing increases
+the deduplication window by a factor of 10, at the expense of also
+increasing the storage size by a factor of 10. However with sparse
+indexing, the memory requirements do not increase. The trade-off is
+slightly more computation per request and a slight decrease in the amount
+of deduplication detected. For most workloads with significant amounts of
+duplicate data, sparse indexing will detect 97-99% of the deduplication
+that a standard index will detect.
+
+The vio and data_vio Structures
+-------------------------------
+
+A vio (short for Vdo I/O) is conceptually similar to a bio, with additional
+fields and data to track vdo-specific information. A struct vio maintains a
+pointer to a bio but also tracks other fields specific to the operation of
+vdo. The vio is kept separate from its related bio because there are many
+circumstances where vdo completes the bio but must continue to do work
+related to deduplication or compression.
+
+Metadata reads and writes, and other writes that originate within vdo, use
+a struct vio directly. Application reads and writes use a larger structure
+called a data_vio to track information about their progress. A struct
+data_vio contain a struct vio and also includes several other fields
+related to deduplication and other vdo features. The data_vio is the
+primary unit of application work in vdo. Each data_vio proceeds through a
+set of steps to handle the application data, after which it is reset and
+returned to a pool of data_vios for reuse.
+
+There is a fixed pool of 2048 data_vios. This number was chosen to bound
+the amount of work that is required to recover from a crash. In addition,
+benchmarks have indicated that increasing the size of the pool does not
+significantly improve performance.
+
+The Data Store
+--------------
+
+The data store is implemented by three main data structures, all of which
+work in concert to reduce or amortize metadata updates across as many data
+writes as possible.
+
+*The Slab Depot*
+
+Most of the vdo volume belongs to the slab depot. The depot contains a
+collection of slabs. The slabs can be up to 32GB, and are divided into
+three sections. Most of a slab consists of a linear sequence of 4K blocks.
+These blocks are used either to store data, or to hold portions of the
+block map (see below). In addition to the data blocks, each slab has a set
+of reference counters, using 1 byte for each data block. Finally each slab
+has a journal.
+
+Reference updates are written to the slab journal. Slab journal blocks are
+written out either when they are full, or when the recovery journal
+requests they do so in order to allow the main recovery journal (see below)
+to free up space. The slab journal is used both to ensure that the main
+recovery journal can regularly free up space, and also to amortize the cost
+of updating individual reference blocks. The reference counters are kept in
+memory and are written out, a block at a time in oldest-dirtied-order, only
+when there is a need to reclaim slab journal space. The write operations
+are performed in the background as needed so they do not add latency to
+particular I/O operations.
+
+Each slab is independent of every other. They are assigned to "physical
+zones" in round-robin fashion. If there are P physical zones, then slab n
+is assigned to zone n mod P.
+
+The slab depot maintains an additional small data structure, the "slab
+summary," which is used to reduce the amount of work needed to come back
+online after a crash. The slab summary maintains an entry for each slab
+indicating whether or not the slab has ever been used, whether all of its
+reference count updates have been persisted to storage, and approximately
+how full it is. During recovery, each physical zone will attempt to recover
+at least one slab, stopping whenever it has recovered a slab which has some
+free blocks. Once each zone has some space, or has determined that none is
+available, the target can resume normal operation in a degraded mode. Read
+and write requests can be serviced, perhaps with degraded performance,
+while the remainder of the dirty slabs are recovered.
+
+*The Block Map*
+
+The block map contains the logical to physical mapping. It can be thought
+of as an array with one entry per logical address. Each entry is 5 bytes,
+36 bits of which contain the physical block number which holds the data for
+the given logical address. The other 4 bits are used to indicate the nature
+of the mapping. Of the 16 possible states, one represents a logical address
+which is unmapped (i.e. it has never been written, or has been discarded),
+one represents an uncompressed block, and the other 14 states are used to
+indicate that the mapped data is compressed, and which of the compression
+slots in the compressed block contains the data for this logical address.
+
+In practice, the array of mapping entries is divided into "block map
+pages," each of which fits in a single 4K block. Each block map page
+consists of a header and 812 mapping entries. Each mapping page is actually
+a leaf of a radix tree which consists of block map pages at each level.
+There are 60 radix trees which are assigned to "logical zones" in round
+robin fashion. (If there are L logical zones, tree n will belong to zone n
+mod L.) At each level, the trees are interleaved, so logical addresses
+0-811 belong to tree 0, logical addresses 812-1623 belong to tree 1, and so
+on. The interleaving is maintained all the way up to the 60 root nodes.
+Choosing 60 trees results in an evenly distributed number of trees per zone
+for a large number of possible logical zone counts. The storage for the 60
+tree roots is allocated at format time. All other block map pages are
+allocated out of the slabs as needed. This flexible allocation avoids the
+need to pre-allocate space for the entire set of logical mappings and also
+makes growing the logical size of a vdo relatively easy.
+
+In operation, the block map maintains two caches. It is prohibitive to keep
+the entire leaf level of the trees in memory, so each logical zone
+maintains its own cache of leaf pages. The size of this cache is
+configurable at target start time. The second cache is allocated at start
+time, and is large enough to hold all the non-leaf pages of the entire
+block map. This cache is populated as pages are needed.
+
+*The Recovery Journal*
+
+The recovery journal is used to amortize updates across the block map and
+slab depot. Each write request causes an entry to be made in the journal.
+Entries are either "data remappings" or "block map remappings." For a data
+remapping, the journal records the logical address affected and its old and
+new physical mappings. For a block map remapping, the journal records the
+block map page number and the physical block allocated for it. Block map
+pages are never reclaimed or repurposed, so the old mapping is always 0.
+
+Each journal entry is an intent record summarizing the metadata updates
+that are required for a data_vio. The recovery journal issues a flush
+before each journal block write to ensure that the physical data for the
+new block mappings in that block are stable on storage, and journal block
+writes are all issued with the FUA bit set to ensure the recovery journal
+entries themselves are stable. The journal entry and the data write it
+represents must be stable on disk before the other metadata structures may
+be updated to reflect the operation. These entries allow the vdo device to
+reconstruct the logical to physical mappings after an unexpected
+interruption such as a loss of power.
+
+*Write Path*
+
+All write I/O to vdo is asynchronous. Each bio will be acknowledged as soon
+as vdo has done enough work to guarantee that it can complete the write
+eventually. Generally, the data for acknowledged but unflushed write I/O
+can be treated as though it is cached in memory. If an application
+requires data to be stable on storage, it must issue a flush or write the
+data with the FUA bit set like any other asynchronous I/O. Shutting down
+the vdo target will also flush any remaining I/O.
+
+Application write bios follow the steps outlined below.
+
+1. A data_vio is obtained from the data_vio pool and associated with the
+ application bio. If there are no data_vios available, the incoming bio
+ will block until a data_vio is available. This provides back pressure
+ to the application. The data_vio pool is protected by a spin lock.
+
+ The newly acquired data_vio is reset and the bio's data is copied into
+ the data_vio if it is a write and the data is not all zeroes. The data
+ must be copied because the application bio can be acknowledged before
+ the data_vio processing is complete, which means later processing steps
+ will no longer have access to the application bio. The application bio
+ may also be smaller than 4K, in which case the data_vio will have
+ already read the underlying block and the data is instead copied over
+ the relevant portion of the larger block.
+
+2. The data_vio places a claim (the "logical lock") on the logical address
+ of the bio. It is vital to prevent simultaneous modifications of the
+ same logical address, because deduplication involves sharing blocks.
+ This claim is implemented as an entry in a hashtable where the key is
+ the logical address and the value is a pointer to the data_vio
+ currently handling that address.
+
+ If a data_vio looks in the hashtable and finds that another data_vio is
+ already operating on that logical address, it waits until the previous
+ operation finishes. It also sends a message to inform the current
+ lock holder that it is waiting. Most notably, a new data_vio waiting
+ for a logical lock will flush the previous lock holder out of the
+ compression packer (step 8d) rather than allowing it to continue
+ waiting to be packed.
+
+ This stage requires the data_vio to get an implicit lock on the
+ appropriate logical zone to prevent concurrent modifications of the
+ hashtable. This implicit locking is handled by the zone divisions
+ described above.
+
+3. The data_vio traverses the block map tree to ensure that all the
+ necessary internal tree nodes have been allocated, by trying to find
+ the leaf page for its logical address. If any interior tree page is
+ missing, it is allocated at this time out of the same physical storage
+ pool used to store application data.
+
+ a. If any page-node in the tree has not yet been allocated, it must be
+ allocated before the write can continue. This step requires the
+ data_vio to lock the page-node that needs to be allocated. This
+ lock, like the logical block lock in step 2, is a hashtable entry
+ that causes other data_vios to wait for the allocation process to
+ complete.
+
+ The implicit logical zone lock is released while the allocation is
+ happening, in order to allow other operations in the same logical
+ zone to proceed. The details of allocation are the same as in
+ step 4. Once a new node has been allocated, that node is added to
+ the tree using a similar process to adding a new data block mapping.
+ The data_vio journals the intent to add the new node to the block
+ map tree (step 10), updates the reference count of the new block
+ (step 11), and reacquires the implicit logical zone lock to add the
+ new mapping to the parent tree node (step 12). Once the tree is
+ updated, the data_vio proceeds down the tree. Any other data_vios
+ waiting on this allocation also proceed.
+
+ b. In the steady-state case, the block map tree nodes will already be
+ allocated, so the data_vio just traverses the tree until it finds
+ the required leaf node. The location of the mapping (the "block map
+ slot") is recorded in the data_vio so that later steps do not need
+ to traverse the tree again. The data_vio then releases the implicit
+ logical zone lock.
+
+4. If the block is a zero block, skip to step 9. Otherwise, an attempt is
+ made to allocate a free data block. This allocation ensures that the
+ data_vio can write its data somewhere even if deduplication and
+ compression are not possible. This stage gets an implicit lock on a
+ physical zone to search for free space within that zone.
+
+ The data_vio will search each slab in a zone until it finds a free
+ block or decides there are none. If the first zone has no free space,
+ it will proceed to search the next physical zone by taking the implicit
+ lock for that zone and releasing the previous one until it finds a
+ free block or runs out of zones to search. The data_vio will acquire a
+ struct pbn_lock (the "physical block lock") on the free block. The
+ struct pbn_lock also has several fields to record the various kinds of
+ claims that data_vios can have on physical blocks. The pbn_lock is
+ added to a hashtable like the logical block locks in step 2. This
+ hashtable is also covered by the implicit physical zone lock. The
+ reference count of the free block is updated to prevent any other
+ data_vio from considering it free. The reference counters are a
+ sub-component of the slab and are thus also covered by the implicit
+ physical zone lock.
+
+5. If an allocation was obtained, the data_vio has all the resources it
+ needs to complete the write. The application bio can safely be
+ acknowledged at this point. The acknowledgment happens on a separate
+ thread to prevent the application callback from blocking other data_vio
+ operations.
+
+ If an allocation could not be obtained, the data_vio continues to
+ attempt to deduplicate or compress the data, but the bio is not
+ acknowledged because the vdo device may be out of space.
+
+6. At this point vdo must determine where to store the application data.
+ The data_vio's data is hashed and the hash (the "record name") is
+ recorded in the data_vio.
+
+7. The data_vio reserves or joins a struct hash_lock, which manages all of
+ the data_vios currently writing the same data. Active hash locks are
+ tracked in a hashtable similar to the way logical block locks are
+ tracked in step 2. This hashtable is covered by the implicit lock on
+ the hash zone.
+
+ If there is no existing hash lock for this data_vio's record_name, the
+ data_vio obtains a hash lock from the pool, adds it to the hashtable,
+ and sets itself as the new hash lock's "agent." The hash_lock pool is
+ also covered by the implicit hash zone lock. The hash lock agent will
+ do all the work to decide where the application data will be
+ written. If a hash lock for the data_vio's record_name already exists,
+ and the data_vio's data is the same as the agent's data, the new
+ data_vio will wait for the agent to complete its work and then share
+ its result.
+
+ In the rare case that a hash lock exists for the data_vio's hash but
+ the data does not match the hash lock's agent, the data_vio skips to
+ step 8h and attempts to write its data directly. This can happen if two
+ different data blocks produce the same hash, for example.
+
+8. The hash lock agent attempts to deduplicate or compress its data with
+ the following steps.
+
+ a. The agent initializes and sends its embedded deduplication request
+ (struct uds_request) to the deduplication index. This does not
+ require the data_vio to get any locks because the index components
+ manage their own locking. The data_vio waits until it either gets a
+ response from the index or times out.
+
+ b. If the deduplication index returns advice, the data_vio attempts to
+ obtain a physical block lock on the indicated physical address, in
+ order to read the data and verify that it is the same as the
+ data_vio's data, and that it can accept more references. If the
+ physical address is already locked by another data_vio, the data at
+ that address may soon be overwritten so it is not safe to use the
+ address for deduplication.
+
+ c. If the data matches and the physical block can add references, the
+ agent and any other data_vios waiting on it will record this
+ physical block as their new physical address and proceed to step 9
+ to record their new mapping. If there are more data_vios in the hash
+ lock than there are references available, one of the remaining
+ data_vios becomes the new agent and continues to step 8d as if no
+ valid advice was returned.
+
+ d. If no usable duplicate block was found, the agent first checks that
+ it has an allocated physical block (from step 3) that it can write
+ to. If the agent does not have an allocation, some other data_vio in
+ the hash lock that does have an allocation takes over as agent. If
+ none of the data_vios have an allocated physical block, these writes
+ are out of space, so they proceed to step 13 for cleanup.
+
+ e. The agent attempts to compress its data. If the data does not
+ compress, the data_vio will continue to step 8h to write its data
+ directly.
+
+ If the compressed size is small enough, the agent will release the
+ implicit hash zone lock and go to the packer (struct packer) where
+ it will be placed in a bin (struct packer_bin) along with other
+ data_vios. All compression operations require the implicit lock on
+ the packer zone.
+
+ The packer can combine up to 14 compressed blocks in a single 4k
+ data block. Compression is only helpful if vdo can pack at least 2
+ data_vios into a single data block. This means that a data_vio may
+ wait in the packer for an arbitrarily long time for other data_vios
+ to fill out the compressed block. There is a mechanism for vdo to
+ evict waiting data_vios when continuing to wait would cause
+ problems. Circumstances causing an eviction include an application
+ flush, device shutdown, or a subsequent data_vio trying to overwrite
+ the same logical block address. A data_vio may also be evicted from
+ the packer if it cannot be paired with any other compressed block
+ before more compressible blocks need to use its bin. An evicted
+ data_vio will proceed to step 8h to write its data directly.
+
+ f. If the agent fills a packer bin, either because all 14 of its slots
+ are used or because it has no remaining space, it is written out
+ using the allocated physical block from one of its data_vios. Step
+ 8d has already ensured that an allocation is available.
+
+ g. Each data_vio sets the compressed block as its new physical address.
+ The data_vio obtains an implicit lock on the physical zone and
+ acquires the struct pbn_lock for the compressed block, which is
+ modified to be a shared lock. Then it releases the implicit physical
+ zone lock and proceeds to step 8i.
+
+ h. Any data_vio evicted from the packer will have an allocation from
+ step 3. It will write its data to that allocated physical block.
+
+ i. After the data is written, if the data_vio is the agent of a hash
+ lock, it will reacquire the implicit hash zone lock and share its
+ physical address with as many other data_vios in the hash lock as
+ possible. Each data_vio will then proceed to step 9 to record its
+ new mapping.
+
+ j. If the agent actually wrote new data (whether compressed or not),
+ the deduplication index is updated to reflect the location of the
+ new data. The agent then releases the implicit hash zone lock.
+
+9. The data_vio determines the previous mapping of the logical address.
+ There is a cache for block map leaf pages (the "block map cache"),
+ because there are usually too many block map leaf nodes to store
+ entirely in memory. If the desired leaf page is not in the cache, the
+ data_vio will reserve a slot in the cache and load the desired page
+ into it, possibly evicting an older cached page. The data_vio then
+ finds the current physical address for this logical address (the "old
+ physical mapping"), if any, and records it. This step requires a lock
+ on the block map cache structures, covered by the implicit logical zone
+ lock.
+
+10. The data_vio makes an entry in the recovery journal containing the
+ logical block address, the old physical mapping, and the new physical
+ mapping. Making this journal entry requires holding the implicit
+ recovery journal lock. The data_vio will wait in the journal until all
+ recovery blocks up to the one containing its entry have been written
+ and flushed to ensure the transaction is stable on storage.
+
+11. Once the recovery journal entry is stable, the data_vio makes two slab
+ journal entries: an increment entry for the new mapping, and a
+ decrement entry for the old mapping. These two operations each require
+ holding a lock on the affected physical slab, covered by its implicit
+ physical zone lock. For correctness during recovery, the slab journal
+ entries in any given slab journal must be in the same order as the
+ corresponding recovery journal entries. Therefore, if the two entries
+ are in different zones, they are made concurrently, and if they are in
+ the same zone, the increment is always made before the decrement in
+ order to avoid underflow. After each slab journal entry is made in
+ memory, the associated reference count is also updated in memory.
+
+12. Once both of the reference count updates are done, the data_vio
+ acquires the implicit logical zone lock and updates the
+ logical-to-physical mapping in the block map to point to the new
+ physical block. At this point the write operation is complete.
+
+13. If the data_vio has a hash lock, it acquires the implicit hash zone
+ lock and releases its hash lock to the pool.
+
+ The data_vio then acquires the implicit physical zone lock and releases
+ the struct pbn_lock it holds for its allocated block. If it had an
+ allocation that it did not use, it also sets the reference count for
+ that block back to zero to free it for use by subsequent data_vios.
+
+ The data_vio then acquires the implicit logical zone lock and releases
+ the logical block lock acquired in step 2.
+
+ The application bio is then acknowledged if it has not previously been
+ acknowledged, and the data_vio is returned to the pool.
+
+*Read Path*
+
+An application read bio follows a much simpler set of steps. It does steps
+1 and 2 in the write path to obtain a data_vio and lock its logical
+address. If there is already a write data_vio in progress for that logical
+address that is guaranteed to complete, the read data_vio will copy the
+data from the write data_vio and return it. Otherwise, it will look up the
+logical-to-physical mapping by traversing the block map tree as in step 3,
+and then read and possibly decompress the indicated data at the indicated
+physical block address. A read data_vio will not allocate block map tree
+nodes if they are missing. If the interior block map nodes do not exist
+yet, the logical block map address must still be unmapped and the read
+data_vio will return all zeroes. A read data_vio handles cleanup and
+acknowledgment as in step 13, although it only needs to release the logical
+lock and return itself to the pool.
+
+*Small Writes*
+
+All storage within vdo is managed as 4KB blocks, but it can accept writes
+as small as 512 bytes. Processing a write that is smaller than 4K requires
+a read-modify-write operation that reads the relevant 4K block, copies the
+new data over the approriate sectors of the block, and then launches a
+write operation for the modified data block. The read and write stages of
+this operation are nearly identical to the normal read and write
+operations, and a single data_vio is used throughout this operation.
+
+*Recovery*
+
+When a vdo is restarted after a crash, it will attempt to recover from the
+recovery journal. During the pre-resume phase of the next start, the
+recovery journal is read. The increment portion of valid entries are played
+into the block map. Next, valid entries are played, in order as required,
+into the slab journals. Finally, each physical zone attempts to replay at
+least one slab journal to reconstruct the reference counts of one slab.
+Once each zone has some free space (or has determined that it has none),
+the vdo comes back online, while the remainder of the slab journals are
+used to reconstruct the rest of the reference counts in the background.
+
+*Read-only Rebuild*
+
+If a vdo encounters an unrecoverable error, it will enter read-only mode.
+This mode indicates that some previously acknowledged data may have been
+lost. The vdo may be instructed to rebuild as best it can in order to
+return to a writable state. However, this is never done automatically due
+to the possibility that data has been lost. During a read-only rebuild, the
+block map is recovered from the recovery journal as before. However, the
+reference counts are not rebuilt from the slab journals. Instead, the
+reference counts are zeroed, the entire block map is traversed, and the
+reference counts are updated from the block mappings. While this may lose
+some data, it ensures that the block map and reference counts are
+consistent with each other. This allows vdo to resume normal operation and
+accept further writes.
diff --git a/Documentation/admin-guide/device-mapper/vdo.rst b/Documentation/admin-guide/device-mapper/vdo.rst
new file mode 100644
index 0000000..7e1ecaf
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/vdo.rst
@@ -0,0 +1,406 @@
+.. SPDX-License-Identifier: GPL-2.0-only
+
+dm-vdo
+======
+
+The dm-vdo (virtual data optimizer) device mapper target provides
+block-level deduplication, compression, and thin provisioning. As a device
+mapper target, it can add these features to the storage stack, compatible
+with any file system. The vdo target does not protect against data
+corruption, relying instead on integrity protection of the storage below
+it. It is strongly recommended that lvm be used to manage vdo volumes. See
+lvmvdo(7).
+
+Userspace component
+===================
+
+Formatting a vdo volume requires the use of the 'vdoformat' tool, available
+at:
+
+https://github.com/dm-vdo/vdo/
+
+In most cases, a vdo target will recover from a crash automatically the
+next time it is started. In cases where it encountered an unrecoverable
+error (either during normal operation or crash recovery) the target will
+enter or come up in read-only mode. Because read-only mode is indicative of
+data-loss, a positive action must be taken to bring vdo out of read-only
+mode. The 'vdoforcerebuild' tool, available from the same repo, is used to
+prepare a read-only vdo to exit read-only mode. After running this tool,
+the vdo target will rebuild its metadata the next time it is
+started. Although some data may be lost, the rebuilt vdo's metadata will be
+internally consistent and the target will be writable again.
+
+The repo also contains additional userspace tools which can be used to
+inspect a vdo target's on-disk metadata. Fortunately, these tools are
+rarely needed except by dm-vdo developers.
+
+Metadata requirements
+=====================
+
+Each vdo volume reserves 3GB of space for metadata, or more depending on
+its configuration. It is helpful to check that the space saved by
+deduplication and compression is not cancelled out by the metadata
+requirements. An estimation of the space saved for a specific dataset can
+be computed with the vdo estimator tool, which is available at:
+
+https://github.com/dm-vdo/vdoestimator/
+
+Target interface
+================
+
+Table line
+----------
+
+::
+
+ <offset> <logical device size> vdo V4 <storage device>
+ <storage device size> <minimum I/O size> <block map cache size>
+ <block map era length> [optional arguments]
+
+
+Required parameters:
+
+ offset:
+ The offset, in sectors, at which the vdo volume's logical
+ space begins.
+
+ logical device size:
+ The size of the device which the vdo volume will service,
+ in sectors. Must match the current logical size of the vdo
+ volume.
+
+ storage device:
+ The device holding the vdo volume's data and metadata.
+
+ storage device size:
+ The size of the device holding the vdo volume, as a number
+ of 4096-byte blocks. Must match the current size of the vdo
+ volume.
+
+ minimum I/O size:
+ The minimum I/O size for this vdo volume to accept, in
+ bytes. Valid values are 512 or 4096. The recommended value
+ is 4096.
+
+ block map cache size:
+ The size of the block map cache, as a number of 4096-byte
+ blocks. The minimum and recommended value is 32768 blocks.
+ If the logical thread count is non-zero, the cache size
+ must be at least 4096 blocks per logical thread.
+
+ block map era length:
+ The speed with which the block map cache writes out
+ modified block map pages. A smaller era length is likely to
+ reduce the amount of time spent rebuilding, at the cost of
+ increased block map writes during normal operation. The
+ maximum and recommended value is 16380; the minimum value
+ is 1.
+
+Optional parameters:
+--------------------
+Some or all of these parameters may be specified as <key> <value> pairs.
+
+Thread related parameters:
+
+Different categories of work are assigned to separate thread groups, and
+the number of threads in each group can be configured separately.
+
+If <hash>, <logical>, and <physical> are all set to 0, the work handled by
+all three thread types will be handled by a single thread. If any of these
+values are non-zero, all of them must be non-zero.
+
+ ack:
+ The number of threads used to complete bios. Since
+ completing a bio calls an arbitrary completion function
+ outside the vdo volume, threads of this type allow the vdo
+ volume to continue processing requests even when bio
+ completion is slow. The default is 1.
+
+ bio:
+ The number of threads used to issue bios to the underlying
+ storage. Threads of this type allow the vdo volume to
+ continue processing requests even when bio submission is
+ slow. The default is 4.
+
+ bioRotationInterval:
+ The number of bios to enqueue on each bio thread before
+ switching to the next thread. The value must be greater
+ than 0 and not more than 1024; the default is 64.
+
+ cpu:
+ The number of threads used to do CPU-intensive work, such
+ as hashing and compression. The default is 1.
+
+ hash:
+ The number of threads used to manage data comparisons for
+ deduplication based on the hash value of data blocks. The
+ default is 0.
+
+ logical:
+ The number of threads used to manage caching and locking
+ based on the logical address of incoming bios. The default
+ is 0; the maximum is 60.
+
+ physical:
+ The number of threads used to manage administration of the
+ underlying storage device. At format time, a slab size for
+ the vdo is chosen; the vdo storage device must be large
+ enough to have at least 1 slab per physical thread. The
+ default is 0; the maximum is 16.
+
+Miscellaneous parameters:
+
+ maxDiscard:
+ The maximum size of discard bio accepted, in 4096-byte
+ blocks. I/O requests to a vdo volume are normally split
+ into 4096-byte blocks, and processed up to 2048 at a time.
+ However, discard requests to a vdo volume can be
+ automatically split to a larger size, up to <maxDiscard>
+ 4096-byte blocks in a single bio, and are limited to 1500
+ at a time. Increasing this value may provide better overall
+ performance, at the cost of increased latency for the
+ individual discard requests. The default and minimum is 1;
+ the maximum is UINT_MAX / 4096.
+
+ deduplication:
+ Whether deduplication is enabled. The default is 'on'; the
+ acceptable values are 'on' and 'off'.
+
+ compression:
+ Whether compression is enabled. The default is 'off'; the
+ acceptable values are 'on' and 'off'.
+
+Device modification
+-------------------
+
+A modified table may be loaded into a running, non-suspended vdo volume.
+The modifications will take effect when the device is next resumed. The
+modifiable parameters are <logical device size>, <physical device size>,
+<maxDiscard>, <compression>, and <deduplication>.
+
+If the logical device size or physical device size are changed, upon
+successful resume vdo will store the new values and require them on future
+startups. These two parameters may not be decreased. The logical device
+size may not exceed 4 PB. The physical device size must increase by at
+least 32832 4096-byte blocks if at all, and must not exceed the size of the
+underlying storage device. Additionally, when formatting the vdo device, a
+slab size is chosen: the physical device size may never increase above the
+size which provides 8192 slabs, and each increase must be large enough to
+add at least one new slab.
+
+Examples:
+
+Start a previously-formatted vdo volume with 1 GB logical space and 1 GB
+physical space, storing to /dev/dm-1 which has more than 1 GB of space.
+
+::
+
+ dmsetup create vdo0 --table \
+ "0 2097152 vdo V4 /dev/dm-1 262144 4096 32768 16380"
+
+Grow the logical size to 4 GB.
+
+::
+
+ dmsetup reload vdo0 --table \
+ "0 8388608 vdo V4 /dev/dm-1 262144 4096 32768 16380"
+ dmsetup resume vdo0
+
+Grow the physical size to 2 GB.
+
+::
+
+ dmsetup reload vdo0 --table \
+ "0 8388608 vdo V4 /dev/dm-1 524288 4096 32768 16380"
+ dmsetup resume vdo0
+
+Grow the physical size by 1 GB more and increase max discard sectors.
+
+::
+
+ dmsetup reload vdo0 --table \
+ "0 10485760 vdo V4 /dev/dm-1 786432 4096 32768 16380 maxDiscard 8"
+ dmsetup resume vdo0
+
+Stop the vdo volume.
+
+::
+
+ dmsetup remove vdo0
+
+Start the vdo volume again. Note that the logical and physical device sizes
+must still match, but other parameters can change.
+
+::
+
+ dmsetup create vdo1 --table \
+ "0 10485760 vdo V4 /dev/dm-1 786432 512 65550 5000 hash 1 logical 3 physical 2"
+
+Messages
+--------
+All vdo devices accept messages in the form:
+
+::
+ dmsetup message <target-name> 0 <message-name> <message-parameters>
+
+The messages are:
+
+ stats:
+ Outputs the current view of the vdo statistics. Mostly used
+ by the vdostats userspace program to interpret the output
+ buffer.
+
+ dump:
+ Dumps many internal structures to the system log. This is
+ not always safe to run, so it should only be used to debug
+ a hung vdo. Optional parameters to specify structures to
+ dump are:
+
+ viopool: The pool of I/O requests incoming bios
+ pools: A synonym of 'viopool'
+ vdo: Most of the structures managing on-disk data
+ queues: Basic information about each vdo thread
+ threads: A synonym of 'queues'
+ default: Equivalent to 'queues vdo'
+ all: All of the above.
+
+ dump-on-shutdown:
+ Perform a default dump next time vdo shuts down.
+
+
+Status
+------
+
+::
+
+ <device> <operating mode> <in recovery> <index state>
+ <compression state> <physical blocks used> <total physical blocks>
+
+ device:
+ The name of the vdo volume.
+
+ operating mode:
+ The current operating mode of the vdo volume; values may be
+ 'normal', 'recovering' (the volume has detected an issue
+ with its metadata and is attempting to repair itself), and
+ 'read-only' (an error has occurred that forces the vdo
+ volume to only support read operations and not writes).
+
+ in recovery:
+ Whether the vdo volume is currently in recovery mode;
+ values may be 'recovering' or '-' which indicates not
+ recovering.
+
+ index state:
+ The current state of the deduplication index in the vdo
+ volume; values may be 'closed', 'closing', 'error',
+ 'offline', 'online', 'opening', and 'unknown'.
+
+ compression state:
+ The current state of compression in the vdo volume; values
+ may be 'offline' and 'online'.
+
+ used physical blocks:
+ The number of physical blocks in use by the vdo volume.
+
+ total physical blocks:
+ The total number of physical blocks the vdo volume may use;
+ the difference between this value and the
+ <used physical blocks> is the number of blocks the vdo
+ volume has left before being full.
+
+Memory Requirements
+===================
+
+A vdo target requires a fixed 38 MB of RAM along with the following amounts
+that scale with the target:
+
+- 1.15 MB of RAM for each 1 MB of configured block map cache size. The
+ block map cache requires a minimum of 150 MB.
+- 1.6 MB of RAM for each 1 TB of logical space.
+- 268 MB of RAM for each 1 TB of physical storage managed by the volume.
+
+The deduplication index requires additional memory which scales with the
+size of the deduplication window. For dense indexes, the index requires 1
+GB of RAM per 1 TB of window. For sparse indexes, the index requires 1 GB
+of RAM per 10 TB of window. The index configuration is set when the target
+is formatted and may not be modified.
+
+Module Parameters
+=================
+
+The vdo driver has a numeric parameter 'log_level' which controls the
+verbosity of logging from the driver. The default setting is 6
+(LOGLEVEL_INFO and more severe messages).
+
+Run-time Usage
+==============
+
+When using dm-vdo, it is important to be aware of the ways in which its
+behavior differs from other storage targets.
+
+- There is no guarantee that over-writes of existing blocks will succeed.
+ Because the underlying storage may be multiply referenced, over-writing
+ an existing block generally requires a vdo to have a free block
+ available.
+
+- When blocks are no longer in use, sending a discard request for those
+ blocks lets the vdo release references for those blocks. If the vdo is
+ thinly provisioned, discarding unused blocks is essential to prevent the
+ target from running out of space. However, due to the sharing of
+ duplicate blocks, no discard request for any given logical block is
+ guaranteed to reclaim space.
+
+- Assuming the underlying storage properly implements flush requests, vdo
+ is resilient against crashes, however, unflushed writes may or may not
+ persist after a crash.
+
+- Each write to a vdo target entails a significant amount of processing.
+ However, much of the work is paralellizable. Therefore, vdo targets
+ achieve better throughput at higher I/O depths, and can support up 2048
+ requests in parallel.
+
+Tuning
+======
+
+The vdo device has many options, and it can be difficult to make optimal
+choices without perfect knowledge of the workload. Additionally, most
+configuration options must be set when a vdo target is started, and cannot
+be changed without shutting it down completely; the configuration cannot be
+changed while the target is active. Ideally, tuning with simulated
+workloads should be performed before deploying vdo in production
+environments.
+
+The most important value to adjust is the block map cache size. In order to
+service a request for any logical address, a vdo must load the portion of
+the block map which holds the relevant mapping. These mappings are cached.
+Performance will suffer when the working set does not fit in the cache. By
+default, a vdo allocates 128 MB of metadata cache in RAM to support
+efficient access to 100 GB of logical space at a time. It should be scaled
+up proportionally for larger working sets.
+
+The logical and physical thread counts should also be adjusted. A logical
+thread controls a disjoint section of the block map, so additional logical
+threads increase parallelism and can increase throughput. Physical threads
+control a disjoint section of the data blocks, so additional physical
+threads can also increase throughput. However, excess threads can waste
+resources and increase contention.
+
+Bio submission threads control the parallelism involved in sending I/O to
+the underlying storage; fewer threads mean there is more opportunity to
+reorder I/O requests for performance benefit, but also that each I/O
+request has to wait longer before being submitted.
+
+Bio acknowledgment threads are used for finishing I/O requests. This is
+done on dedicated threads since the amount of work required to execute a
+bio's callback can not be controlled by the vdo itself. Usually one thread
+is sufficient but additional threads may be beneficial, particularly when
+bios have CPU-heavy callbacks.
+
+CPU threads are used for hashing and for compression; in workloads with
+compression enabled, more threads may result in higher throughput.
+
+Hash threads are used to sort active requests by hash and determine whether
+they should deduplicate; the most CPU intensive actions done by these
+threads are comparison of 4096-byte data blocks. In most cases, a single
+hash thread is sufficient.
diff --git a/Documentation/admin-guide/edid.rst b/Documentation/admin-guide/edid.rst
index 80deeb2..1a9b965 100644
--- a/Documentation/admin-guide/edid.rst
+++ b/Documentation/admin-guide/edid.rst
@@ -24,37 +24,4 @@
As a remedy for such situations, the kernel configuration item
CONFIG_DRM_LOAD_EDID_FIRMWARE was introduced. It allows to provide an
individually prepared or corrected EDID data set in the /lib/firmware
-directory from where it is loaded via the firmware interface. The code
-(see drivers/gpu/drm/drm_edid_load.c) contains built-in data sets for
-commonly used screen resolutions (800x600, 1024x768, 1280x1024, 1600x1200,
-1680x1050, 1920x1080) as binary blobs, but the kernel source tree does
-not contain code to create these data. In order to elucidate the origin
-of the built-in binary EDID blobs and to facilitate the creation of
-individual data for a specific misbehaving monitor, commented sources
-and a Makefile environment are given here.
-
-To create binary EDID and C source code files from the existing data
-material, simply type "make" in tools/edid/.
-
-If you want to create your own EDID file, copy the file 1024x768.S,
-replace the settings with your own data and add a new target to the
-Makefile. Please note that the EDID data structure expects the timing
-values in a different way as compared to the standard X11 format.
-
-X11:
- HTimings:
- hdisp hsyncstart hsyncend htotal
- VTimings:
- vdisp vsyncstart vsyncend vtotal
-
-EDID::
-
- #define XPIX hdisp
- #define XBLANK htotal-hdisp
- #define XOFFSET hsyncstart-hdisp
- #define XPULSE hsyncend-hsyncstart
-
- #define YPIX vdisp
- #define YBLANK vtotal-vdisp
- #define YOFFSET vsyncstart-vdisp
- #define YPULSE vsyncend-vsyncstart
+directory from where it is loaded via the firmware interface.
diff --git a/Documentation/admin-guide/gpio/gpio-mockup.rst b/Documentation/admin-guide/gpio/gpio-mockup.rst
index 493071d..d6e7438 100644
--- a/Documentation/admin-guide/gpio/gpio-mockup.rst
+++ b/Documentation/admin-guide/gpio/gpio-mockup.rst
@@ -3,6 +3,14 @@
GPIO Testing Driver
===================
+.. note::
+
+ This module has been obsoleted by the more flexible gpio-sim.rst.
+ New developments should use that API and existing developments are
+ encouraged to migrate as soon as possible.
+ This module will continue to be maintained but no new features will be
+ added.
+
The GPIO Testing Driver (gpio-mockup) provides a way to create simulated GPIO
chips for testing purposes. The lines exposed by these chips can be accessed
using the standard GPIO character device interface as well as manipulated
diff --git a/Documentation/admin-guide/gpio/index.rst b/Documentation/admin-guide/gpio/index.rst
index f6861ca..460afd2 100644
--- a/Documentation/admin-guide/gpio/index.rst
+++ b/Documentation/admin-guide/gpio/index.rst
@@ -1,16 +1,16 @@
.. SPDX-License-Identifier: GPL-2.0
====
-gpio
+GPIO
====
.. toctree::
:maxdepth: 1
+ Character Device Userspace API <../../userspace-api/gpio/chardev>
gpio-aggregator
- sysfs
- gpio-mockup
gpio-sim
+ Obsolete APIs <obsolete>
.. only:: subproject and html
diff --git a/Documentation/admin-guide/gpio/obsolete.rst b/Documentation/admin-guide/gpio/obsolete.rst
new file mode 100644
index 0000000..5adbff0
--- /dev/null
+++ b/Documentation/admin-guide/gpio/obsolete.rst
@@ -0,0 +1,13 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==================
+Obsolete GPIO APIs
+==================
+
+.. toctree::
+ :maxdepth: 1
+
+ Character Device Userspace API (v1) <../../userspace-api/gpio/chardev_v1>
+ Sysfs Interface <../../userspace-api/gpio/sysfs>
+ Mockup Testing Module <gpio-mockup>
+
diff --git a/Documentation/admin-guide/gpio/sysfs.rst b/Documentation/admin-guide/gpio/sysfs.rst
deleted file mode 100644
index 35171d1..0000000
--- a/Documentation/admin-guide/gpio/sysfs.rst
+++ /dev/null
@@ -1,167 +0,0 @@
-GPIO Sysfs Interface for Userspace
-==================================
-
-.. warning::
-
- THIS ABI IS DEPRECATED, THE ABI DOCUMENTATION HAS BEEN MOVED TO
- Documentation/ABI/obsolete/sysfs-gpio AND NEW USERSPACE CONSUMERS
- ARE SUPPOSED TO USE THE CHARACTER DEVICE ABI. THIS OLD SYSFS ABI WILL
- NOT BE DEVELOPED (NO NEW FEATURES), IT WILL JUST BE MAINTAINED.
-
-Refer to the examples in tools/gpio/* for an introduction to the new
-character device ABI. Also see the userspace header in
-include/uapi/linux/gpio.h
-
-The deprecated sysfs ABI
-------------------------
-Platforms which use the "gpiolib" implementors framework may choose to
-configure a sysfs user interface to GPIOs. This is different from the
-debugfs interface, since it provides control over GPIO direction and
-value instead of just showing a gpio state summary. Plus, it could be
-present on production systems without debugging support.
-
-Given appropriate hardware documentation for the system, userspace could
-know for example that GPIO #23 controls the write protect line used to
-protect boot loader segments in flash memory. System upgrade procedures
-may need to temporarily remove that protection, first importing a GPIO,
-then changing its output state, then updating the code before re-enabling
-the write protection. In normal use, GPIO #23 would never be touched,
-and the kernel would have no need to know about it.
-
-Again depending on appropriate hardware documentation, on some systems
-userspace GPIO can be used to determine system configuration data that
-standard kernels won't know about. And for some tasks, simple userspace
-GPIO drivers could be all that the system really needs.
-
-DO NOT ABUSE SYSFS TO CONTROL HARDWARE THAT HAS PROPER KERNEL DRIVERS.
-PLEASE READ THE DOCUMENT AT Documentation/driver-api/gpio/drivers-on-gpio.rst
-TO AVOID REINVENTING KERNEL WHEELS IN USERSPACE. I MEAN IT. REALLY.
-
-Paths in Sysfs
---------------
-There are three kinds of entries in /sys/class/gpio:
-
- - Control interfaces used to get userspace control over GPIOs;
-
- - GPIOs themselves; and
-
- - GPIO controllers ("gpio_chip" instances).
-
-That's in addition to standard files including the "device" symlink.
-
-The control interfaces are write-only:
-
- /sys/class/gpio/
-
- "export" ...
- Userspace may ask the kernel to export control of
- a GPIO to userspace by writing its number to this file.
-
- Example: "echo 19 > export" will create a "gpio19" node
- for GPIO #19, if that's not requested by kernel code.
-
- "unexport" ...
- Reverses the effect of exporting to userspace.
-
- Example: "echo 19 > unexport" will remove a "gpio19"
- node exported using the "export" file.
-
-GPIO signals have paths like /sys/class/gpio/gpio42/ (for GPIO #42)
-and have the following read/write attributes:
-
- /sys/class/gpio/gpioN/
-
- "direction" ...
- reads as either "in" or "out". This value may
- normally be written. Writing as "out" defaults to
- initializing the value as low. To ensure glitch free
- operation, values "low" and "high" may be written to
- configure the GPIO as an output with that initial value.
-
- Note that this attribute *will not exist* if the kernel
- doesn't support changing the direction of a GPIO, or
- it was exported by kernel code that didn't explicitly
- allow userspace to reconfigure this GPIO's direction.
-
- "value" ...
- reads as either 0 (low) or 1 (high). If the GPIO
- is configured as an output, this value may be written;
- any nonzero value is treated as high.
-
- If the pin can be configured as interrupt-generating interrupt
- and if it has been configured to generate interrupts (see the
- description of "edge"), you can poll(2) on that file and
- poll(2) will return whenever the interrupt was triggered. If
- you use poll(2), set the events POLLPRI and POLLERR. If you
- use select(2), set the file descriptor in exceptfds. After
- poll(2) returns, either lseek(2) to the beginning of the sysfs
- file and read the new value or close the file and re-open it
- to read the value.
-
- "edge" ...
- reads as either "none", "rising", "falling", or
- "both". Write these strings to select the signal edge(s)
- that will make poll(2) on the "value" file return.
-
- This file exists only if the pin can be configured as an
- interrupt generating input pin.
-
- "active_low" ...
- reads as either 0 (false) or 1 (true). Write
- any nonzero value to invert the value attribute both
- for reading and writing. Existing and subsequent
- poll(2) support configuration via the edge attribute
- for "rising" and "falling" edges will follow this
- setting.
-
-GPIO controllers have paths like /sys/class/gpio/gpiochip42/ (for the
-controller implementing GPIOs starting at #42) and have the following
-read-only attributes:
-
- /sys/class/gpio/gpiochipN/
-
- "base" ...
- same as N, the first GPIO managed by this chip
-
- "label" ...
- provided for diagnostics (not always unique)
-
- "ngpio" ...
- how many GPIOs this manages (N to N + ngpio - 1)
-
-Board documentation should in most cases cover what GPIOs are used for
-what purposes. However, those numbers are not always stable; GPIOs on
-a daughtercard might be different depending on the base board being used,
-or other cards in the stack. In such cases, you may need to use the
-gpiochip nodes (possibly in conjunction with schematics) to determine
-the correct GPIO number to use for a given signal.
-
-
-Exporting from Kernel code
---------------------------
-Kernel code can explicitly manage exports of GPIOs which have already been
-requested using gpio_request()::
-
- /* export the GPIO to userspace */
- int gpiod_export(struct gpio_desc *desc, bool direction_may_change);
-
- /* reverse gpiod_export() */
- void gpiod_unexport(struct gpio_desc *desc);
-
- /* create a sysfs link to an exported GPIO node */
- int gpiod_export_link(struct device *dev, const char *name,
- struct gpio_desc *desc);
-
-After a kernel driver requests a GPIO, it may only be made available in
-the sysfs interface by gpiod_export(). The driver can control whether the
-signal direction may change. This helps drivers prevent userspace code
-from accidentally clobbering important system state.
-
-This explicit exporting can help with debugging (by making some kinds
-of experiments easier), or can provide an always-there interface that's
-suitable for documenting as part of a board support package.
-
-After the GPIO has been exported, gpiod_export_link() allows creating
-symlinks from elsewhere in sysfs to the GPIO sysfs node. Drivers can
-use this to provide the interface under their own device in sysfs with
-a descriptive name.
diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst
index de99caa..ff0b440 100644
--- a/Documentation/admin-guide/hw-vuln/index.rst
+++ b/Documentation/admin-guide/hw-vuln/index.rst
@@ -21,3 +21,4 @@
cross-thread-rsb
srso
gather_data_sampling
+ reg-file-data-sampling
diff --git a/Documentation/admin-guide/hw-vuln/reg-file-data-sampling.rst b/Documentation/admin-guide/hw-vuln/reg-file-data-sampling.rst
new file mode 100644
index 0000000..0585d02
--- /dev/null
+++ b/Documentation/admin-guide/hw-vuln/reg-file-data-sampling.rst
@@ -0,0 +1,104 @@
+==================================
+Register File Data Sampling (RFDS)
+==================================
+
+Register File Data Sampling (RFDS) is a microarchitectural vulnerability that
+only affects Intel Atom parts(also branded as E-cores). RFDS may allow
+a malicious actor to infer data values previously used in floating point
+registers, vector registers, or integer registers. RFDS does not provide the
+ability to choose which data is inferred. CVE-2023-28746 is assigned to RFDS.
+
+Affected Processors
+===================
+Below is the list of affected Intel processors [#f1]_:
+
+ =================== ============
+ Common name Family_Model
+ =================== ============
+ ATOM_GOLDMONT 06_5CH
+ ATOM_GOLDMONT_D 06_5FH
+ ATOM_GOLDMONT_PLUS 06_7AH
+ ATOM_TREMONT_D 06_86H
+ ATOM_TREMONT 06_96H
+ ALDERLAKE 06_97H
+ ALDERLAKE_L 06_9AH
+ ATOM_TREMONT_L 06_9CH
+ RAPTORLAKE 06_B7H
+ RAPTORLAKE_P 06_BAH
+ ATOM_GRACEMONT 06_BEH
+ RAPTORLAKE_S 06_BFH
+ =================== ============
+
+As an exception to this table, Intel Xeon E family parts ALDERLAKE(06_97H) and
+RAPTORLAKE(06_B7H) codenamed Catlow are not affected. They are reported as
+vulnerable in Linux because they share the same family/model with an affected
+part. Unlike their affected counterparts, they do not enumerate RFDS_CLEAR or
+CPUID.HYBRID. This information could be used to distinguish between the
+affected and unaffected parts, but it is deemed not worth adding complexity as
+the reporting is fixed automatically when these parts enumerate RFDS_NO.
+
+Mitigation
+==========
+Intel released a microcode update that enables software to clear sensitive
+information using the VERW instruction. Like MDS, RFDS deploys the same
+mitigation strategy to force the CPU to clear the affected buffers before an
+attacker can extract the secrets. This is achieved by using the otherwise
+unused and obsolete VERW instruction in combination with a microcode update.
+The microcode clears the affected CPU buffers when the VERW instruction is
+executed.
+
+Mitigation points
+-----------------
+VERW is executed by the kernel before returning to user space, and by KVM
+before VMentry. None of the affected cores support SMT, so VERW is not required
+at C-state transitions.
+
+New bits in IA32_ARCH_CAPABILITIES
+----------------------------------
+Newer processors and microcode update on existing affected processors added new
+bits to IA32_ARCH_CAPABILITIES MSR. These bits can be used to enumerate
+vulnerability and mitigation capability:
+
+- Bit 27 - RFDS_NO - When set, processor is not affected by RFDS.
+- Bit 28 - RFDS_CLEAR - When set, processor is affected by RFDS, and has the
+ microcode that clears the affected buffers on VERW execution.
+
+Mitigation control on the kernel command line
+---------------------------------------------
+The kernel command line allows to control RFDS mitigation at boot time with the
+parameter "reg_file_data_sampling=". The valid arguments are:
+
+ ========== =================================================================
+ on If the CPU is vulnerable, enable mitigation; CPU buffer clearing
+ on exit to userspace and before entering a VM.
+ off Disables mitigation.
+ ========== =================================================================
+
+Mitigation default is selected by CONFIG_MITIGATION_RFDS.
+
+Mitigation status information
+-----------------------------
+The Linux kernel provides a sysfs interface to enumerate the current
+vulnerability status of the system: whether the system is vulnerable, and
+which mitigations are active. The relevant sysfs file is:
+
+ /sys/devices/system/cpu/vulnerabilities/reg_file_data_sampling
+
+The possible values in this file are:
+
+ .. list-table::
+
+ * - 'Not affected'
+ - The processor is not vulnerable
+ * - 'Vulnerable'
+ - The processor is vulnerable, but no mitigation enabled
+ * - 'Vulnerable: No microcode'
+ - The processor is vulnerable but microcode is not updated.
+ * - 'Mitigation: Clear Register File'
+ - The processor is vulnerable and the CPU buffer clearing mitigation is
+ enabled.
+
+References
+----------
+.. [#f1] Affected Processors
+ https://www.intel.com/content/www/us/en/developer/topic-technology/software-security-guidance/processors-affected-consolidated-product-cpu-model.html
diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst
index 32a8893..cce768a 100644
--- a/Documentation/admin-guide/hw-vuln/spectre.rst
+++ b/Documentation/admin-guide/hw-vuln/spectre.rst
@@ -473,8 +473,8 @@
-mindirect-branch=thunk-extern -mindirect-branch-register options.
If the kernel is compiled with a Clang compiler, the compiler needs
to support -mretpoline-external-thunk option. The kernel config
- CONFIG_RETPOLINE needs to be turned on, and the CPU needs to run with
- the latest updated microcode.
+ CONFIG_MITIGATION_RETPOLINE needs to be turned on, and the CPU needs
+ to run with the latest updated microcode.
On Intel Skylake-era systems the mitigation covers most, but not all,
cases. See :ref:`[3] <spec_ref3>` for more details.
@@ -609,8 +609,8 @@
Selecting 'on' will, and 'auto' may, choose a
mitigation method at run time according to the
CPU, the available microcode, the setting of the
- CONFIG_RETPOLINE configuration option, and the
- compiler with which the kernel was built.
+ CONFIG_MITIGATION_RETPOLINE configuration option,
+ and the compiler with which the kernel was built.
Selecting 'on' will also enable the mitigation
against user space to user space task attacks.
diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst
index fb40a1f..32ea52f 100644
--- a/Documentation/admin-guide/index.rst
+++ b/Documentation/admin-guide/index.rst
@@ -1,3 +1,4 @@
+=================================================
The Linux kernel user's and administrator's guide
=================================================
@@ -37,6 +38,7 @@
reporting-issues
reporting-regressions
quickly-build-trimmed-linux
+ verify-bugs-and-bisect-regressions
bug-hunting
bug-bisect
tainted-kernels
@@ -122,7 +124,7 @@
pmf
pnp
rapidio
- ras
+ RAS/index
rtc
serial-console
svga
diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst
index 5762e74..0302a93 100644
--- a/Documentation/admin-guide/kdump/kdump.rst
+++ b/Documentation/admin-guide/kdump/kdump.rst
@@ -191,9 +191,7 @@
CPU is enough for kdump kernel to dump vmcore on most of systems.
However, you can also specify nr_cpus=X to enable multiple processors
- in kdump kernel. In this case, "disable_cpu_apicid=" is needed to
- tell kdump kernel which cpu is 1st kernel's BSP. Please refer to
- admin-guide/kernel-parameters.txt for more details.
+ in kdump kernel.
With CONFIG_SMP=n, the above things are not related.
@@ -454,8 +452,7 @@
to use multi-thread programs with it, such as parallel dump feature of
makedumpfile. Otherwise, the multi-thread program may have a great
performance degradation. To enable multi-cpu support, you should bring up an
- SMP dump-capture kernel and specify maxcpus/nr_cpus, disable_cpu_apicid=[X]
- options while loading it.
+ SMP dump-capture kernel and specify maxcpus/nr_cpus options while loading it.
* For s390x there are two kdump modes: If a ELF header is specified with
the elfcorehdr= kernel parameter, it is used by the kdump kernel as it
diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst
index bced9e4..0f714fc 100644
--- a/Documentation/admin-guide/kdump/vmcoreinfo.rst
+++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst
@@ -65,11 +65,11 @@
the kernel start address. Used to convert a virtual address from the
direct kernel map to a physical address.
-vmap_area_list
---------------
+VMALLOC_START
+-------------
-Stores the virtual area list. makedumpfile gets the vmalloc start value
-from this variable and its value is necessary for vmalloc translation.
+Stores the base address of vmalloc area. makedumpfile gets this value
+since is necessary for vmalloc translation.
mem_map
-------
diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst
index 4410384..e8bdf5e 100644
--- a/Documentation/admin-guide/kernel-parameters.rst
+++ b/Documentation/admin-guide/kernel-parameters.rst
@@ -108,6 +108,7 @@
CMA Contiguous Memory Area support is enabled.
DRM Direct Rendering Management support is enabled.
DYNAMIC_DEBUG Build in debug messages and enable them at runtime
+ EARLY Parameter processed too early to be embedded in initrd.
EDD BIOS Enhanced Disk Drive Services (EDD) is enabled
EFI EFI Partitioning (GPT) is enabled
EVM Extended Verification Module
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 31b3a25..623fce7 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -9,7 +9,7 @@
accept_memory=eager can be used to accept all memory
at once during boot.
- acpi= [HW,ACPI,X86,ARM64,RISCV64]
+ acpi= [HW,ACPI,X86,ARM64,RISCV64,EARLY]
Advanced Configuration and Power Interface
Format: { force | on | off | strict | noirq | rsdt |
copy_dsdt }
@@ -26,7 +26,7 @@
See also Documentation/power/runtime_pm.rst, pci=noacpi
- acpi_apic_instance= [ACPI, IOAPIC]
+ acpi_apic_instance= [ACPI,IOAPIC,EARLY]
Format: <int>
2: use 2nd APIC table, if available
1,0: use 1st APIC table
@@ -41,7 +41,7 @@
If set to native, use the device's native backlight mode.
If set to none, disable the ACPI backlight interface.
- acpi_force_32bit_fadt_addr
+ acpi_force_32bit_fadt_addr [ACPI,EARLY]
force FADT to use 32 bit addresses rather than the
64 bit X_* addresses. Some firmware have broken 64
bit addresses for force ACPI ignore these and use
@@ -97,7 +97,7 @@
no: ACPI OperationRegions are not marked as reserved,
no further checks are performed.
- acpi_force_table_verification [HW,ACPI]
+ acpi_force_table_verification [HW,ACPI,EARLY]
Enable table checksum verification during early stage.
By default, this is disabled due to x86 early mapping
size limitation.
@@ -137,7 +137,7 @@
acpi_no_memhotplug [ACPI] Disable memory hotplug. Useful for kdump
kernels.
- acpi_no_static_ssdt [HW,ACPI]
+ acpi_no_static_ssdt [HW,ACPI,EARLY]
Disable installation of static SSDTs at early boot time
By default, SSDTs contained in the RSDT/XSDT will be
installed automatically and they will appear under
@@ -151,7 +151,7 @@
Ignore the ACPI-based watchdog interface (WDAT) and let
a native driver control the watchdog device instead.
- acpi_rsdp= [ACPI,EFI,KEXEC]
+ acpi_rsdp= [ACPI,EFI,KEXEC,EARLY]
Pass the RSDP address to the kernel, mostly used
on machines running EFI runtime service to boot the
second kernel for kdump.
@@ -228,10 +228,10 @@
to assume that this machine's pmtimer latches its value
and always returns good values.
- acpi_sci= [HW,ACPI] ACPI System Control Interrupt trigger mode
+ acpi_sci= [HW,ACPI,EARLY] ACPI System Control Interrupt trigger mode
Format: { level | edge | high | low }
- acpi_skip_timer_override [HW,ACPI]
+ acpi_skip_timer_override [HW,ACPI,EARLY]
Recognize and ignore IRQ0/pin2 Interrupt Override.
For broken nForce2 BIOS resulting in XT-PIC timer.
@@ -266,11 +266,11 @@
behave incorrectly in some ways with respect to system
suspend and resume to be ignored (use wisely).
- acpi_use_timer_override [HW,ACPI]
+ acpi_use_timer_override [HW,ACPI,EARLY]
Use timer override. For some broken Nvidia NF5 boards
that require a timer override, but don't have HPET
- add_efi_memmap [EFI; X86] Include EFI memory map in
+ add_efi_memmap [EFI,X86,EARLY] Include EFI memory map in
kernel's map of available physical RAM.
agp= [AGP]
@@ -307,7 +307,7 @@
do not want to use tracing_snapshot_alloc() as it needs
to be done where GFP_KERNEL allocations are allowed.
- allow_mismatched_32bit_el0 [ARM64]
+ allow_mismatched_32bit_el0 [ARM64,EARLY]
Allow execve() of 32-bit applications and setting of the
PER_LINUX32 personality on systems where only a strict
subset of the CPUs support 32-bit EL0. When this
@@ -351,7 +351,7 @@
This mode requires kvm-amd.avic=1.
(Default when IOMMU HW support is present.)
- amd_pstate= [X86]
+ amd_pstate= [X86,EARLY]
disable
Do not enable amd_pstate as the default
scaling driver for the supported processors
@@ -374,6 +374,11 @@
selects a performance level in this range and appropriate
to the current workload.
+ amd_prefcore=
+ [X86]
+ disable
+ Disable amd-pstate preferred core.
+
amijoy.map= [HW,JOY] Amiga joystick support
Map of devices attached to JOY0DAT and JOY1DAT
Format: <a>,<b>
@@ -391,7 +396,7 @@
not play well with APC CPU idle - disable it if you have
APC and your system crashes randomly.
- apic= [APIC,X86] Advanced Programmable Interrupt Controller
+ apic= [APIC,X86,EARLY] Advanced Programmable Interrupt Controller
Change the output verbosity while booting
Format: { quiet (default) | verbose | debug }
Change the amount of debugging information output
@@ -401,7 +406,7 @@
Format: apic=driver_name
Examples: apic=bigsmp
- apic_extnmi= [APIC,X86] External NMI delivery setting
+ apic_extnmi= [APIC,X86,EARLY] External NMI delivery setting
Format: { bsp (default) | all | none }
bsp: External NMI is delivered only to CPU 0
all: External NMIs are broadcast to all CPUs as a
@@ -508,21 +513,22 @@
bert_disable [ACPI]
Disable BERT OS support on buggy BIOSes.
- bgrt_disable [ACPI][X86]
+ bgrt_disable [ACPI,X86,EARLY]
Disable BGRT to avoid flickering OEM logo.
blkdevparts= Manual partition parsing of block device(s) for
embedded devices based on command line input.
See Documentation/block/cmdline-partition.rst
- boot_delay= Milliseconds to delay each printk during boot.
+ boot_delay= [KNL,EARLY]
+ Milliseconds to delay each printk during boot.
Only works if CONFIG_BOOT_PRINTK_DELAY is enabled,
and you may also have to specify "lpj=". Boot_delay
values larger than 10 seconds (10000) are assumed
erroneous and ignored.
Format: integer
- bootconfig [KNL]
+ bootconfig [KNL,EARLY]
Extended command line options can be added to an initrd
and this will cause the kernel to look for it.
@@ -557,7 +563,7 @@
trust validation.
format: { id:<keyid> | builtin }
- cca= [MIPS] Override the kernel pages' cache coherency
+ cca= [MIPS,EARLY] Override the kernel pages' cache coherency
algorithm. Accepted values range from 0 to 7
inclusive. See arch/mips/include/asm/pgtable-bits.h
for platform specific values (SB1, Loongson3 and
@@ -672,19 +678,13 @@
[X86-64] hpet,tsc
clocksource.arm_arch_timer.evtstrm=
- [ARM,ARM64]
+ [ARM,ARM64,EARLY]
Format: <bool>
Enable/disable the eventstream feature of the ARM
architected timer so that code using WFE-based polling
loops can be debugged more effectively on production
systems.
- clocksource.max_cswd_read_retries= [KNL]
- Number of clocksource_watchdog() retries due to
- external delays before the clock will be marked
- unstable. Defaults to two retries, that is,
- three attempts to read the clock under test.
-
clocksource.verify_n_cpus= [KNL]
Limit the number of CPUs checked for clocksources
marked with CLOCK_SOURCE_VERIFY_PERCPU that
@@ -702,7 +702,7 @@
10 seconds when built into the kernel.
cma=nn[MG]@[start[MG][-end[MG]]]
- [KNL,CMA]
+ [KNL,CMA,EARLY]
Sets the size of kernel global memory area for
contiguous memory allocations and optionally the
placement constraint by the physical address range of
@@ -711,7 +711,7 @@
kernel/dma/contiguous.c
cma_pernuma=nn[MG]
- [KNL,CMA]
+ [KNL,CMA,EARLY]
Sets the size of kernel per-numa memory area for
contiguous memory allocations. A value of 0 disables
per-numa CMA altogether. And If this option is not
@@ -722,7 +722,7 @@
they will fallback to the global default memory area.
numa_cma=<node>:nn[MG][,<node>:nn[MG]]
- [KNL,CMA]
+ [KNL,CMA,EARLY]
Sets the size of kernel numa memory area for
contiguous memory allocations. It will reserve CMA
area for the specified node.
@@ -739,7 +739,7 @@
a hypervisor.
Default: yes
- coherent_pool=nn[KMG] [ARM,KNL]
+ coherent_pool=nn[KMG] [ARM,KNL,EARLY]
Sets the size of memory pool for coherent, atomic dma
allocations, by default set to 256K.
@@ -757,7 +757,7 @@
condev= [HW,S390] console device
conmode=
- con3215_drop= [S390] 3215 console drop mode.
+ con3215_drop= [S390,EARLY] 3215 console drop mode.
Format: y|n|Y|N|1|0
When set to true, drop data on the 3215 console when
the console buffer is full. In this case the
@@ -863,7 +863,7 @@
kernel before the cpufreq driver probes.
cpu_init_udelay=N
- [X86] Delay for N microsec between assert and de-assert
+ [X86,EARLY] Delay for N microsec between assert and de-assert
of APIC INIT to start processors. This delay occurs
on every CPU online, such as boot, and resume from suspend.
Default: 10000
@@ -883,7 +883,7 @@
kernel more unstable.
crashkernel=size[KMG][@offset[KMG]]
- [KNL] Using kexec, Linux can switch to a 'crash kernel'
+ [KNL,EARLY] Using kexec, Linux can switch to a 'crash kernel'
upon panic. This parameter reserves the physical
memory region [offset, offset + size] for that kernel
image. If '@offset' is omitted, then a suitable offset
@@ -954,10 +954,10 @@
Format: <port#>,<type>
See also Documentation/input/devices/joystick-parport.rst
- debug [KNL] Enable kernel debugging (events log level).
+ debug [KNL,EARLY] Enable kernel debugging (events log level).
debug_boot_weak_hash
- [KNL] Enable printing [hashed] pointers early in the
+ [KNL,EARLY] Enable printing [hashed] pointers early in the
boot sequence. If enabled, we use a weak hash instead
of siphash to hash pointers. Use this option if you are
seeing instances of '(___ptrval___)') and need to see a
@@ -974,10 +974,10 @@
will print _a_lot_ more information - normally only
useful to lockdep developers.
- debug_objects [KNL] Enable object debugging
+ debug_objects [KNL,EARLY] Enable object debugging
debug_guardpage_minorder=
- [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this
+ [KNL,EARLY] When CONFIG_DEBUG_PAGEALLOC is set, this
parameter allows control of the order of pages that will
be intentionally kept free (and hence protected) by the
buddy allocator. Bigger value increase the probability
@@ -996,7 +996,7 @@
help tracking down these problems.
debug_pagealloc=
- [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this parameter
+ [KNL,EARLY] When CONFIG_DEBUG_PAGEALLOC is set, this parameter
enables the feature at boot time. By default, it is
disabled and the system will work mostly the same as a
kernel built without CONFIG_DEBUG_PAGEALLOC.
@@ -1004,8 +1004,8 @@
useful to also enable the page_owner functionality.
on: enable the feature
- debugfs= [KNL] This parameter enables what is exposed to userspace
- and debugfs internal clients.
+ debugfs= [KNL,EARLY] This parameter enables what is exposed to
+ userspace and debugfs internal clients.
Format: { on, no-mount, off }
on: All functions are enabled.
no-mount:
@@ -1084,7 +1084,7 @@
dhash_entries= [KNL]
Set number of hash buckets for dentry cache.
- disable_1tb_segments [PPC]
+ disable_1tb_segments [PPC,EARLY]
Disables the use of 1TB hash page table segments. This
causes the kernel to fall back to 256MB segments which
can be useful when debugging issues that require an SLB
@@ -1093,41 +1093,32 @@
disable= [IPV6]
See Documentation/networking/ipv6.rst.
- disable_radix [PPC]
+ disable_radix [PPC,EARLY]
Disable RADIX MMU mode on POWER9
disable_tlbie [PPC]
Disable TLBIE instruction. Currently does not work
with KVM, with HASH MMU, or with coherent accelerators.
- disable_cpu_apicid= [X86,APIC,SMP]
- Format: <int>
- The number of initial APIC ID for the
- corresponding CPU to be disabled at boot,
- mostly used for the kdump 2nd kernel to
- disable BSP to wake up multiple CPUs without
- causing system reset or hang due to sending
- INIT from AP to BSP.
-
- disable_ddw [PPC/PSERIES]
+ disable_ddw [PPC/PSERIES,EARLY]
Disable Dynamic DMA Window support. Use this
to workaround buggy firmware.
disable_ipv6= [IPV6]
See Documentation/networking/ipv6.rst.
- disable_mtrr_cleanup [X86]
+ disable_mtrr_cleanup [X86,EARLY]
The kernel tries to adjust MTRR layout from continuous
to discrete, to make X server driver able to add WB
entry later. This parameter disables that.
- disable_mtrr_trim [X86, Intel and AMD only]
+ disable_mtrr_trim [X86, Intel and AMD only,EARLY]
By default the kernel will trim any uncacheable
memory out of your available memory pool based on
MTRR settings. This parameter disables that behavior,
possibly causing your machine to run very slowly.
- disable_timer_pin_1 [X86]
+ disable_timer_pin_1 [X86,EARLY]
Disable PIN 1 of APIC timer
Can be useful to work around chipset bugs.
@@ -1150,6 +1141,26 @@
The filter can be disabled or changed to another
driver later using sysfs.
+ reg_file_data_sampling=
+ [X86] Controls mitigation for Register File Data
+ Sampling (RFDS) vulnerability. RFDS is a CPU
+ vulnerability which may allow userspace to infer
+ kernel data values previously stored in floating point
+ registers, vector registers, or integer registers.
+ RFDS only affects Intel Atom processors.
+
+ on: Turns ON the mitigation.
+ off: Turns OFF the mitigation.
+
+ This parameter overrides the compile time default set
+ by CONFIG_MITIGATION_RFDS. Mitigation cannot be
+ disabled when other VERW based mitigations (like MDS)
+ are enabled. In order to disable RFDS mitigation all
+ VERW based mitigations need to be disabled.
+
+ For details see:
+ Documentation/admin-guide/hw-vuln/reg-file-data-sampling.rst
+
driver_async_probe= [KNL]
List of driver names to be probed asynchronously. *
matches with all driver names. If * is specified, the
@@ -1162,22 +1173,16 @@
panels may send no or incorrect EDID data sets.
This parameter allows to specify an EDID data sets
in the /lib/firmware directory that are used instead.
- Generic built-in EDID data sets are used, if one of
- edid/1024x768.bin, edid/1280x1024.bin,
- edid/1680x1050.bin, or edid/1920x1080.bin is given
- and no file with the same name exists. Details and
- instructions how to build your own EDID data are
- available in Documentation/admin-guide/edid.rst. An EDID
- data set will only be used for a particular connector,
- if its name and a colon are prepended to the EDID
- name. Each connector may use a unique EDID data
- set by separating the files with a comma. An EDID
+ An EDID data set will only be used for a particular
+ connector, if its name and a colon are prepended to
+ the EDID name. Each connector may use a unique EDID
+ data set by separating the files with a comma. An EDID
data set with no connector name will be used for
any connectors not explicitly specified.
dscc4.setup= [NET]
- dt_cpu_ftrs= [PPC]
+ dt_cpu_ftrs= [PPC,EARLY]
Format: {"off" | "known"}
Control how the dt_cpu_ftrs device-tree binding is
used for CPU feature discovery and setup (if it
@@ -1197,12 +1202,12 @@
Documentation/admin-guide/dynamic-debug-howto.rst
for details.
- early_ioremap_debug [KNL]
+ early_ioremap_debug [KNL,EARLY]
Enable debug messages in early_ioremap support. This
is useful for tracking down temporary early mappings
which are not unmapped.
- earlycon= [KNL] Output early console device and options.
+ earlycon= [KNL,EARLY] Output early console device and options.
When used with no options, the early console is
determined by stdout-path property in device tree's
@@ -1338,7 +1343,7 @@
address must be provided, and the serial port must
already be setup and configured.
- earlyprintk= [X86,SH,ARM,M68k,S390]
+ earlyprintk= [X86,SH,ARM,M68k,S390,UM,EARLY]
earlyprintk=vga
earlyprintk=sclp
earlyprintk=xen
@@ -1396,7 +1401,7 @@
edd= [EDD]
Format: {"off" | "on" | "skip[mbr]"}
- efi= [EFI]
+ efi= [EFI,EARLY]
Format: { "debug", "disable_early_pci_dma",
"nochunk", "noruntime", "nosoftreserve",
"novamap", "no_disable_early_pci_dma" }
@@ -1417,13 +1422,13 @@
no_disable_early_pci_dma: Leave the busmaster bit set
on all PCI bridges while in the EFI boot stub
- efi_no_storage_paranoia [EFI; X86]
+ efi_no_storage_paranoia [EFI,X86,EARLY]
Using this parameter you can use more than 50% of
your efi variable storage. Use this parameter only if
you are really sure that your UEFI does sane gc and
fulfills the spec otherwise your board may brick.
- efi_fake_mem= nn[KMG]@ss[KMG]:aa[,nn[KMG]@ss[KMG]:aa,..] [EFI; X86]
+ efi_fake_mem= nn[KMG]@ss[KMG]:aa[,nn[KMG]@ss[KMG]:aa,..] [EFI,X86,EARLY]
Add arbitrary attribute to specific memory range by
updating original EFI memory map.
Region of memory which aa attribute is added to is
@@ -1454,7 +1459,7 @@
eisa_irq_edge= [PARISC,HW]
See header of drivers/parisc/eisa.c.
- ekgdboc= [X86,KGDB] Allow early kernel console debugging
+ ekgdboc= [X86,KGDB,EARLY] Allow early kernel console debugging
Format: ekgdboc=kbd
This is designed to be used in conjunction with
@@ -1469,13 +1474,13 @@
See comment before function elanfreq_setup() in
arch/x86/kernel/cpu/cpufreq/elanfreq.c.
- elfcorehdr=[size[KMG]@]offset[KMG] [PPC,SH,X86,S390]
+ elfcorehdr=[size[KMG]@]offset[KMG] [PPC,SH,X86,S390,EARLY]
Specifies physical address of start of kernel core
image elf header and optionally the size. Generally
kexec loader will pass this option to capture kernel.
See Documentation/admin-guide/kdump/kdump.rst for details.
- enable_mtrr_cleanup [X86]
+ enable_mtrr_cleanup [X86,EARLY]
The kernel tries to adjust MTRR layout from continuous
to discrete, to make X server driver able to add WB
entry later. This parameter enables that.
@@ -1508,7 +1513,7 @@
Permit 'security.evm' to be updated regardless of
current integrity status.
- early_page_ext [KNL] Enforces page_ext initialization to earlier
+ early_page_ext [KNL,EARLY] Enforces page_ext initialization to earlier
stages so cover more early boot allocations.
Please note that as side effect some optimizations
might be disabled to achieve that (e.g. parallelized
@@ -1539,6 +1544,12 @@
Warning: use of this parameter will taint the kernel
and may cause unknown problems.
+ fred= [X86-64]
+ Enable/disable Flexible Return and Event Delivery.
+ Format: { on | off }
+ on: enable FRED when it's present.
+ off: disable FRED, the default setting.
+
ftrace=[tracer]
[FTRACE] will set and start the specified tracer
as early as possible in order to facilitate early
@@ -1561,12 +1572,28 @@
The above will cause the "foo" tracing instance to trigger
a snapshot at the end of boot up.
- ftrace_dump_on_oops[=orig_cpu]
+ ftrace_dump_on_oops[=2(orig_cpu) | =<instance>][,<instance> |
+ ,<instance>=2(orig_cpu)]
[FTRACE] will dump the trace buffers on oops.
- If no parameter is passed, ftrace will dump
- buffers of all CPUs, but if you pass orig_cpu, it will
- dump only the buffer of the CPU that triggered the
- oops.
+ If no parameter is passed, ftrace will dump global
+ buffers of all CPUs, if you pass 2 or orig_cpu, it
+ will dump only the buffer of the CPU that triggered
+ the oops, or the specific instance will be dumped if
+ its name is passed. Multiple instance dump is also
+ supported, and instances are separated by commas. Each
+ instance supports only dump on CPU that triggered the
+ oops by passing 2 or orig_cpu to it.
+
+ ftrace_dump_on_oops=foo=orig_cpu
+
+ The above will dump only the buffer of "foo" instance
+ on CPU that triggered the oops.
+
+ ftrace_dump_on_oops,foo,bar=orig_cpu
+
+ The above will dump global buffer on all CPUs, the
+ buffer of "foo" instance on all CPUs and the buffer
+ of "bar" instance on CPU that triggered the oops.
ftrace_filter=[function-list]
[FTRACE] Limit the functions traced by the function
@@ -1600,7 +1627,7 @@
can be changed at run time by the max_graph_depth file
in the tracefs tracing directory. default: 0 (no limit)
- fw_devlink= [KNL] Create device links between consumer and supplier
+ fw_devlink= [KNL,EARLY] Create device links between consumer and supplier
devices by scanning the firmware to infer the
consumer/supplier relationships. This feature is
especially useful when drivers are loaded as modules as
@@ -1619,12 +1646,12 @@
rpm -- Like "on", but also use to order runtime PM.
fw_devlink.strict=<bool>
- [KNL] Treat all inferred dependencies as mandatory
+ [KNL,EARLY] Treat all inferred dependencies as mandatory
dependencies. This only applies for fw_devlink=on|rpm.
Format: <bool>
fw_devlink.sync_state =
- [KNL] When all devices that could probe have finished
+ [KNL,EARLY] When all devices that could probe have finished
probing, this parameter controls what to do with
devices that haven't yet received their sync_state()
calls.
@@ -1645,12 +1672,12 @@
gamma= [HW,DRM]
- gart_fix_e820= [X86-64] disable the fix e820 for K8 GART
+ gart_fix_e820= [X86-64,EARLY] disable the fix e820 for K8 GART
Format: off | on
default: on
gather_data_sampling=
- [X86,INTEL] Control the Gather Data Sampling (GDS)
+ [X86,INTEL,EARLY] Control the Gather Data Sampling (GDS)
mitigation.
Gather Data Sampling is a hardware vulnerability which
@@ -1748,7 +1775,18 @@
(that will set all pages holding image data
during restoration read-only).
- highmem=nn[KMG] [KNL,BOOT] forces the highmem zone to have an exact
+ hibernate.compressor= [HIBERNATION] Compression algorithm to be
+ used with hibernation.
+ Format: { lzo | lz4 }
+ Default: lzo
+
+ lzo: Select LZO compression algorithm to
+ compress/decompress hibernation image.
+
+ lz4: Select LZ4 compression algorithm to
+ compress/decompress hibernation image.
+
+ highmem=nn[KMG] [KNL,BOOT,EARLY] forces the highmem zone to have an exact
size of <nn>. This works even on boxes that have no
highmem otherwise. This also works to reduce highmem
size on bigger boxes.
@@ -1759,7 +1797,7 @@
hlt [BUGS=ARM,SH]
- hostname= [KNL] Set the hostname (aka UTS nodename).
+ hostname= [KNL,EARLY] Set the hostname (aka UTS nodename).
Format: <string>
This allows setting the system's hostname during early
startup. This sets the name returned by gethostname.
@@ -1804,7 +1842,7 @@
Documentation/admin-guide/mm/hugetlbpage.rst.
Format: size[KMG]
- hugetlb_cma= [HW,CMA] The size of a CMA area used for allocation
+ hugetlb_cma= [HW,CMA,EARLY] The size of a CMA area used for allocation
of gigantic hugepages. Or using node format, the size
of a CMA area per node can be specified.
Format: nn[KMGTPE] or (node format)
@@ -1850,9 +1888,10 @@
If specified, z/VM IUCV HVC accepts connections
from listed z/VM user IDs only.
- hv_nopvspin [X86,HYPER_V] Disables the paravirt spinlock optimizations
- which allow the hypervisor to 'idle' the
- guest on lock contention.
+ hv_nopvspin [X86,HYPER_V,EARLY]
+ Disables the paravirt spinlock optimizations
+ which allow the hypervisor to 'idle' the guest
+ on lock contention.
i2c_bus= [HW] Override the default board specific I2C bus speed
or register an additional I2C bus that is not
@@ -1917,7 +1956,7 @@
Format: <io>[,<membase>[,<icn_id>[,<icn_id2>]]]
- idle= [X86]
+ idle= [X86,EARLY]
Format: idle=poll, idle=halt, idle=nomwait
Poll forces a polling idle loop that can slightly
improve the performance of waking up a idle CPU, but
@@ -1973,7 +2012,7 @@
mode generally follows that for the NaN encoding,
except where unsupported by hardware.
- ignore_loglevel [KNL]
+ ignore_loglevel [KNL,EARLY]
Ignore loglevel setting - this will print /all/
kernel messages to the console. Useful for debugging.
We also add it as printk module parameter, so users
@@ -2091,21 +2130,21 @@
unpacking being completed before device_ and
late_ initcalls.
- initrd= [BOOT] Specify the location of the initial ramdisk
+ initrd= [BOOT,EARLY] Specify the location of the initial ramdisk
- initrdmem= [KNL] Specify a physical address and size from which to
+ initrdmem= [KNL,EARLY] Specify a physical address and size from which to
load the initrd. If an initrd is compiled in or
specified in the bootparams, it takes priority over this
setting.
Format: ss[KMG],nn[KMG]
Default is 0, 0
- init_on_alloc= [MM] Fill newly allocated pages and heap objects with
+ init_on_alloc= [MM,EARLY] Fill newly allocated pages and heap objects with
zeroes.
Format: 0 | 1
Default set by CONFIG_INIT_ON_ALLOC_DEFAULT_ON.
- init_on_free= [MM] Fill freed pages and heap objects with zeroes.
+ init_on_free= [MM,EARLY] Fill freed pages and heap objects with zeroes.
Format: 0 | 1
Default set by CONFIG_INIT_ON_FREE_DEFAULT_ON.
@@ -2161,7 +2200,7 @@
0 disables intel_idle and fall back on acpi_idle.
1 to 9 specify maximum depth of C-state.
- intel_pstate= [X86]
+ intel_pstate= [X86,EARLY]
disable
Do not enable intel_pstate as the default
scaling driver for the supported processors
@@ -2205,7 +2244,7 @@
Allow per-logical-CPU P-State performance control limits using
cpufreq sysfs interface
- intremap= [X86-64, Intel-IOMMU]
+ intremap= [X86-64,Intel-IOMMU,EARLY]
on enable Interrupt Remapping (default)
off disable Interrupt Remapping
nosid disable Source ID checking
@@ -2217,7 +2256,7 @@
strict regions from userspace.
relaxed
- iommu= [X86]
+ iommu= [X86,EARLY]
off
force
noforce
@@ -2232,7 +2271,7 @@
nobypass [PPC/POWERNV]
Disable IOMMU bypass, using IOMMU for PCI devices.
- iommu.forcedac= [ARM64, X86] Control IOVA allocation for PCI devices.
+ iommu.forcedac= [ARM64,X86,EARLY] Control IOVA allocation for PCI devices.
Format: { "0" | "1" }
0 - Try to allocate a 32-bit DMA address first, before
falling back to the full range if needed.
@@ -2240,7 +2279,7 @@
forcing Dual Address Cycle for PCI cards supporting
greater than 32-bit addressing.
- iommu.strict= [ARM64, X86, S390] Configure TLB invalidation behaviour
+ iommu.strict= [ARM64,X86,S390,EARLY] Configure TLB invalidation behaviour
Format: { "0" | "1" }
0 - Lazy mode.
Request that DMA unmap operations use deferred
@@ -2256,7 +2295,7 @@
legacy driver-specific options takes precedence.
iommu.passthrough=
- [ARM64, X86] Configure DMA to bypass the IOMMU by default.
+ [ARM64,X86,EARLY] Configure DMA to bypass the IOMMU by default.
Format: { "0" | "1" }
0 - Use IOMMU translation for DMA.
1 - Bypass the IOMMU for DMA.
@@ -2266,7 +2305,7 @@
See comment before marvel_specify_io7 in
arch/alpha/kernel/core_marvel.c.
- io_delay= [X86] I/O delay method
+ io_delay= [X86,EARLY] I/O delay method
0x80
Standard port 0x80 based delay
0xed
@@ -2279,28 +2318,28 @@
ip= [IP_PNP]
See Documentation/admin-guide/nfs/nfsroot.rst.
- ipcmni_extend [KNL] Extend the maximum number of unique System V
+ ipcmni_extend [KNL,EARLY] Extend the maximum number of unique System V
IPC identifiers from 32,768 to 16,777,216.
irqaffinity= [SMP] Set the default irq affinity mask
The argument is a cpu list, as described above.
irqchip.gicv2_force_probe=
- [ARM, ARM64]
+ [ARM,ARM64,EARLY]
Format: <bool>
Force the kernel to look for the second 4kB page
of a GICv2 controller even if the memory range
exposed by the device tree is too small.
irqchip.gicv3_nolpi=
- [ARM, ARM64]
+ [ARM,ARM64,EARLY]
Force the kernel to ignore the availability of
LPIs (and by consequence ITSs). Intended for system
that use the kernel as a bootloader, and thus want
to let secondary kernels in charge of setting up
LPIs.
- irqchip.gicv3_pseudo_nmi= [ARM64]
+ irqchip.gicv3_pseudo_nmi= [ARM64,EARLY]
Enables support for pseudo-NMIs in the kernel. This
requires the kernel to be built with
CONFIG_ARM64_PSEUDO_NMI.
@@ -2445,7 +2484,7 @@
parameter KASAN will print report only for the first
invalid access.
- keep_bootcon [KNL]
+ keep_bootcon [KNL,EARLY]
Do not unregister boot console at start. This is only
useful for debugging when something happens in the window
between unregistering the boot console and initializing
@@ -2453,7 +2492,7 @@
keepinitrd [HW,ARM] See retain_initrd.
- kernelcore= [KNL,X86,IA-64,PPC]
+ kernelcore= [KNL,X86,IA-64,PPC,EARLY]
Format: nn[KMGTPE] | nn% | "mirror"
This parameter specifies the amount of memory usable by
the kernel for non-movable allocations. The requested
@@ -2478,7 +2517,7 @@
for Movable pages. "nn[KMGTPE]", "nn%", and "mirror"
are exclusive, so you cannot specify multiple forms.
- kgdbdbgp= [KGDB,HW] kgdb over EHCI usb debug port.
+ kgdbdbgp= [KGDB,HW,EARLY] kgdb over EHCI usb debug port.
Format: <Controller#>[,poll interval]
The controller # is the number of the ehci usb debug
port as it is probed via PCI. The poll interval is
@@ -2499,7 +2538,7 @@
kms, kbd format: kms,kbd
kms, kbd and serial format: kms,kbd,<ser_dev>[,baud]
- kgdboc_earlycon= [KGDB,HW]
+ kgdboc_earlycon= [KGDB,HW,EARLY]
If the boot console provides the ability to read
characters and can work in polling mode, you can use
this parameter to tell kgdb to use it as a backend
@@ -2514,14 +2553,14 @@
blank and the first boot console that implements
read() will be picked.
- kgdbwait [KGDB] Stop kernel execution and enter the
+ kgdbwait [KGDB,EARLY] Stop kernel execution and enter the
kernel debugger at the earliest opportunity.
kmac= [MIPS] Korina ethernet MAC address.
Configure the RouterBoard 532 series on-chip
Ethernet adapter MAC address.
- kmemleak= [KNL] Boot-time kmemleak enable/disable
+ kmemleak= [KNL,EARLY] Boot-time kmemleak enable/disable
Valid arguments: on, off
Default: on
Built with CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y,
@@ -2540,8 +2579,8 @@
See also Documentation/trace/kprobetrace.rst "Kernel
Boot Parameter" section.
- kpti= [ARM64] Control page table isolation of user
- and kernel address spaces.
+ kpti= [ARM64,EARLY] Control page table isolation of
+ user and kernel address spaces.
Default: enabled on cores which need mitigation.
0: force disabled
1: force enabled
@@ -2618,7 +2657,8 @@
for NPT.
kvm-arm.mode=
- [KVM,ARM] Select one of KVM/arm64's modes of operation.
+ [KVM,ARM,EARLY] Select one of KVM/arm64's modes of
+ operation.
none: Forcefully disable KVM.
@@ -2638,22 +2678,22 @@
used with extreme caution.
kvm-arm.vgic_v3_group0_trap=
- [KVM,ARM] Trap guest accesses to GICv3 group-0
+ [KVM,ARM,EARLY] Trap guest accesses to GICv3 group-0
system registers
kvm-arm.vgic_v3_group1_trap=
- [KVM,ARM] Trap guest accesses to GICv3 group-1
+ [KVM,ARM,EARLY] Trap guest accesses to GICv3 group-1
system registers
kvm-arm.vgic_v3_common_trap=
- [KVM,ARM] Trap guest accesses to GICv3 common
+ [KVM,ARM,EARLY] Trap guest accesses to GICv3 common
system registers
kvm-arm.vgic_v4_enable=
- [KVM,ARM] Allow use of GICv4 for direct injection of
- LPIs.
+ [KVM,ARM,EARLY] Allow use of GICv4 for direct
+ injection of LPIs.
- kvm_cma_resv_ratio=n [PPC]
+ kvm_cma_resv_ratio=n [PPC,EARLY]
Reserves given percentage from system memory area for
contiguous memory allocation for KVM hash pagetable
allocation.
@@ -2706,7 +2746,7 @@
(enabled). Disable by KVM if hardware lacks support
for it.
- l1d_flush= [X86,INTEL]
+ l1d_flush= [X86,INTEL,EARLY]
Control mitigation for L1D based snooping vulnerability.
Certain CPUs are vulnerable to an exploit against CPU
@@ -2723,7 +2763,7 @@
on - enable the interface for the mitigation
- l1tf= [X86] Control mitigation of the L1TF vulnerability on
+ l1tf= [X86,EARLY] Control mitigation of the L1TF vulnerability on
affected CPUs
The kernel PTE inversion protection is unconditionally
@@ -2792,7 +2832,7 @@
l3cr= [PPC]
- lapic [X86-32,APIC] Enable the local APIC even if BIOS
+ lapic [X86-32,APIC,EARLY] Enable the local APIC even if BIOS
disabled it.
lapic= [X86,APIC] Do not use TSC deadline
@@ -2800,7 +2840,7 @@
back to the programmable timer unit in the LAPIC.
Format: notscdeadline
- lapic_timer_c2_ok [X86,APIC] trust the local apic timer
+ lapic_timer_c2_ok [X86,APIC,EARLY] trust the local apic timer
in C2 power state.
libata.dma= [LIBATA] DMA control
@@ -2924,7 +2964,7 @@
lockd.nlm_udpport=M [NFS] Assign UDP port.
Format: <integer>
- lockdown= [SECURITY]
+ lockdown= [SECURITY,EARLY]
{ integrity | confidentiality }
Enable the kernel lockdown feature. If set to
integrity, kernel features that allow userland to
@@ -3031,7 +3071,8 @@
logibm.irq= [HW,MOUSE] Logitech Bus Mouse Driver
Format: <irq>
- loglevel= All Kernel Messages with a loglevel smaller than the
+ loglevel= [KNL,EARLY]
+ All Kernel Messages with a loglevel smaller than the
console loglevel will be printed to the console. It can
also be changed with klogd or other programs. The
loglevels are defined as follows:
@@ -3045,13 +3086,15 @@
6 (KERN_INFO) informational
7 (KERN_DEBUG) debug-level messages
- log_buf_len=n[KMG] Sets the size of the printk ring buffer,
- in bytes. n must be a power of two and greater
- than the minimal size. The minimal size is defined
- by LOG_BUF_SHIFT kernel config parameter. There is
- also CONFIG_LOG_CPU_MAX_BUF_SHIFT config parameter
- that allows to increase the default size depending on
- the number of CPUs. See init/Kconfig for more details.
+ log_buf_len=n[KMG] [KNL,EARLY]
+ Sets the size of the printk ring buffer, in bytes.
+ n must be a power of two and greater than the
+ minimal size. The minimal size is defined by
+ LOG_BUF_SHIFT kernel config parameter. There
+ is also CONFIG_LOG_CPU_MAX_BUF_SHIFT config
+ parameter that allows to increase the default size
+ depending on the number of CPUs. See init/Kconfig
+ for more details.
logo.nologo [FB] Disables display of the built-in Linux logo.
This may be used to provide more screen space for
@@ -3109,7 +3152,7 @@
max_addr=nn[KMG] [KNL,BOOT,IA-64] All physical memory greater
than or equal to this physical address is ignored.
- maxcpus= [SMP] Maximum number of processors that an SMP kernel
+ maxcpus= [SMP,EARLY] Maximum number of processors that an SMP kernel
will bring up during bootup. maxcpus=n : n >= 0 limits
the kernel to bring up 'n' processors. Surely after
bootup you can bring up the other plugged cpu by executing
@@ -3136,7 +3179,7 @@
Format: <first>,<last>
Specifies range of consoles to be captured by the MDA.
- mds= [X86,INTEL]
+ mds= [X86,INTEL,EARLY]
Control mitigation for the Micro-architectural Data
Sampling (MDS) vulnerability.
@@ -3168,11 +3211,12 @@
For details see: Documentation/admin-guide/hw-vuln/mds.rst
- mem=nn[KMG] [HEXAGON] Set the memory size.
+ mem=nn[KMG] [HEXAGON,EARLY] Set the memory size.
Must be specified, otherwise memory size will be 0.
- mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory
- Amount of memory to be used in cases as follows:
+ mem=nn[KMG] [KNL,BOOT,EARLY] Force usage of a specific amount
+ of memory Amount of memory to be used in cases
+ as follows:
1 for test;
2 when the kernel is not able to see the whole system memory;
@@ -3196,8 +3240,8 @@
if system memory of hypervisor is not sufficient.
mem=nn[KMG]@ss[KMG]
- [ARM,MIPS] - override the memory layout reported by
- firmware.
+ [ARM,MIPS,EARLY] - override the memory layout
+ reported by firmware.
Define a memory region of size nn[KMG] starting at
ss[KMG].
Multiple different regions can be specified with
@@ -3206,7 +3250,7 @@
mem=nopentium [BUGS=X86-32] Disable usage of 4MB pages for kernel
memory.
- memblock=debug [KNL] Enable memblock debug messages.
+ memblock=debug [KNL,EARLY] Enable memblock debug messages.
memchunk=nn[KMG]
[KNL,SH] Allow user to override the default size for
@@ -3220,14 +3264,14 @@
option.
See Documentation/admin-guide/mm/memory-hotplug.rst.
- memmap=exactmap [KNL,X86] Enable setting of an exact
+ memmap=exactmap [KNL,X86,EARLY] Enable setting of an exact
E820 memory map, as specified by the user.
Such memmap=exactmap lines can be constructed based on
BIOS output or other requirements. See the memmap=nn@ss
option description.
memmap=nn[KMG]@ss[KMG]
- [KNL, X86, MIPS, XTENSA] Force usage of a specific region of memory.
+ [KNL, X86,MIPS,XTENSA,EARLY] Force usage of a specific region of memory.
Region of memory to be used is from ss to ss+nn.
If @ss[KMG] is omitted, it is equivalent to mem=nn[KMG],
which limits max address to nn[KMG].
@@ -3237,11 +3281,11 @@
memmap=100M@2G,100M#3G,1G!1024G
memmap=nn[KMG]#ss[KMG]
- [KNL,ACPI] Mark specific memory as ACPI data.
+ [KNL,ACPI,EARLY] Mark specific memory as ACPI data.
Region of memory to be marked is from ss to ss+nn.
memmap=nn[KMG]$ss[KMG]
- [KNL,ACPI] Mark specific memory as reserved.
+ [KNL,ACPI,EARLY] Mark specific memory as reserved.
Region of memory to be reserved is from ss to ss+nn.
Example: Exclude memory from 0x18690000-0x1869ffff
memmap=64K$0x18690000
@@ -3251,14 +3295,14 @@
like Grub2, otherwise '$' and the following number
will be eaten.
- memmap=nn[KMG]!ss[KMG]
+ memmap=nn[KMG]!ss[KMG,EARLY]
[KNL,X86] Mark specific memory as protected.
Region of memory to be used, from ss to ss+nn.
The memory region may be marked as e820 type 12 (0xc)
and is NVDIMM or ADR memory.
memmap=<size>%<offset>-<oldtype>+<newtype>
- [KNL,ACPI] Convert memory within the specified region
+ [KNL,ACPI,EARLY] Convert memory within the specified region
from <oldtype> to <newtype>. If "-<oldtype>" is left
out, the whole region will be marked as <newtype>,
even if previously unavailable. If "+<newtype>" is left
@@ -3266,7 +3310,7 @@
specified as e820 types, e.g., 1 = RAM, 2 = reserved,
3 = ACPI, 12 = PRAM.
- memory_corruption_check=0/1 [X86]
+ memory_corruption_check=0/1 [X86,EARLY]
Some BIOSes seem to corrupt the first 64k of
memory when doing things like suspend/resume.
Setting this option will scan the memory
@@ -3278,13 +3322,13 @@
affects the same memory, you can use memmap=
to prevent the kernel from using that memory.
- memory_corruption_check_size=size [X86]
+ memory_corruption_check_size=size [X86,EARLY]
By default it checks for corruption in the low
64k, making this memory unavailable for normal
use. Use this parameter to scan for
corruption in more or less memory.
- memory_corruption_check_period=seconds [X86]
+ memory_corruption_check_period=seconds [X86,EARLY]
By default it checks for corruption every 60
seconds. Use this parameter to check at some
other rate. 0 disables periodic checking.
@@ -3308,7 +3352,7 @@
Note that even when enabled, there are a few cases where
the feature is not effective.
- memtest= [KNL,X86,ARM,M68K,PPC,RISCV] Enable memtest
+ memtest= [KNL,X86,ARM,M68K,PPC,RISCV,EARLY] Enable memtest
Format: <integer>
default : 0 <disable>
Specifies the number of memtest passes to be
@@ -3320,9 +3364,7 @@
mem_encrypt= [X86-64] AMD Secure Memory Encryption (SME) control
Valid arguments: on, off
- Default (depends on kernel configuration option):
- on (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y)
- off (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=n)
+ Default: off
mem_encrypt=on: Activate SME
mem_encrypt=off: Do not activate SME
@@ -3376,7 +3418,7 @@
https://repo.or.cz/w/linux-2.6/mini2440.git
mitigations=
- [X86,PPC,S390,ARM64] Control optional mitigations for
+ [X86,PPC,S390,ARM64,EARLY] Control optional mitigations for
CPU vulnerabilities. This is a set of curated,
arch-independent options, each of which is an
aggregation of existing arch-specific options.
@@ -3398,7 +3440,9 @@
nospectre_bhb [ARM64]
nospectre_v1 [X86,PPC]
nospectre_v2 [X86,PPC,S390,ARM64]
+ reg_file_data_sampling=off [X86]
retbleed=off [X86]
+ spec_rstack_overflow=off [X86]
spec_store_bypass_disable=off [X86,PPC]
spectre_v2_user=off [X86]
srbds=off [X86,INTEL]
@@ -3429,7 +3473,7 @@
retbleed=auto,nosmt [X86]
mminit_loglevel=
- [KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this
+ [KNL,EARLY] When CONFIG_DEBUG_MEMORY_INIT is set, this
parameter allows control of the logging verbosity for
the additional memory initialisation checks. A value
of 0 disables mminit logging and a level of 4 will
@@ -3437,7 +3481,7 @@
so loglevel=8 may also need to be specified.
mmio_stale_data=
- [X86,INTEL] Control mitigation for the Processor
+ [X86,INTEL,EARLY] Control mitigation for the Processor
MMIO Stale Data vulnerabilities.
Processor MMIO Stale Data is a class of
@@ -3512,7 +3556,7 @@
mousedev.yres= [MOUSE] Vertical screen resolution, used for devices
reporting absolute coordinates, such as tablets
- movablecore= [KNL,X86,IA-64,PPC]
+ movablecore= [KNL,X86,IA-64,PPC,EARLY]
Format: nn[KMGTPE] | nn%
This parameter is the complement to kernelcore=, it
specifies the amount of memory used for migratable
@@ -3523,7 +3567,7 @@
that the amount of memory usable for all allocations
is not too small.
- movable_node [KNL] Boot-time switch to make hotplugable memory
+ movable_node [KNL,EARLY] Boot-time switch to make hotplugable memory
NUMA nodes to be movable. This means that the memory
of such nodes will be usable only for movable
allocations which rules out almost all kernel
@@ -3547,21 +3591,21 @@
[HW] Make the MicroTouch USB driver use raw coordinates
('y', default) or cooked coordinates ('n')
- mtrr=debug [X86]
+ mtrr=debug [X86,EARLY]
Enable printing debug information related to MTRR
registers at boot time.
- mtrr_chunk_size=nn[KMG] [X86]
+ mtrr_chunk_size=nn[KMG,X86,EARLY]
used for mtrr cleanup. It is largest continuous chunk
that could hold holes aka. UC entries.
- mtrr_gran_size=nn[KMG] [X86]
+ mtrr_gran_size=nn[KMG,X86,EARLY]
Used for mtrr cleanup. It is granularity of mtrr block.
Default is 1.
Large value could prevent small alignment from
using up MTRRs.
- mtrr_spare_reg_nr=n [X86]
+ mtrr_spare_reg_nr=n [X86,EARLY]
Format: <integer>
Range: 0,7 : spare reg number
Default : 1
@@ -3747,27 +3791,23 @@
emulation library even if a 387 maths coprocessor
is present.
- no4lvl [RISCV] Disable 4-level and 5-level paging modes. Forces
- kernel to use 3-level paging instead.
+ no4lvl [RISCV,EARLY] Disable 4-level and 5-level paging modes.
+ Forces kernel to use 3-level paging instead.
- no5lvl [X86-64,RISCV] Disable 5-level paging mode. Forces
+ no5lvl [X86-64,RISCV,EARLY] Disable 5-level paging mode. Forces
kernel to use 4-level paging instead.
- noaliencache [MM, NUMA, SLAB] Disables the allocation of alien
- caches in the slab allocator. Saves per-node memory,
- but will impact performance.
-
noalign [KNL,ARM]
- noaltinstr [S390] Disables alternative instructions patching
- (CPU alternatives feature).
+ noaltinstr [S390,EARLY] Disables alternative instructions
+ patching (CPU alternatives feature).
- noapic [SMP,APIC] Tells the kernel to not make use of any
+ noapic [SMP,APIC,EARLY] Tells the kernel to not make use of any
IOAPICs that may be present in the system.
noautogroup Disable scheduler automatic task group creation.
- nocache [ARM]
+ nocache [ARM,EARLY]
no_console_suspend
[HW] Never suspend the console
@@ -3785,13 +3825,13 @@
turn on/off it dynamically.
no_debug_objects
- [KNL] Disable object debugging
+ [KNL,EARLY] Disable object debugging
nodsp [SH] Disable hardware DSP at boot time.
- noefi Disable EFI runtime services support.
+ noefi [EFI,EARLY] Disable EFI runtime services support.
- no_entry_flush [PPC] Don't flush the L1-D cache when entering the kernel.
+ no_entry_flush [PPC,EARLY] Don't flush the L1-D cache when entering the kernel.
noexec [IA-64]
@@ -3822,6 +3862,7 @@
real-time systems.
no_hash_pointers
+ [KNL,EARLY]
Force pointers printed to the console or buffers to be
unhashed. By default, when a pointer is printed via %p
format string, that pointer is "hashed", i.e. obscured
@@ -3846,9 +3887,9 @@
the impact of the sleep instructions. This is also
useful when using JTAG debugger.
- nohugeiomap [KNL,X86,PPC,ARM64] Disable kernel huge I/O mappings.
+ nohugeiomap [KNL,X86,PPC,ARM64,EARLY] Disable kernel huge I/O mappings.
- nohugevmalloc [KNL,X86,PPC,ARM64] Disable kernel huge vmalloc mappings.
+ nohugevmalloc [KNL,X86,PPC,ARM64,EARLY] Disable kernel huge vmalloc mappings.
nohz= [KNL] Boottime enable/disable dynamic ticks
Valid arguments: on, off
@@ -3870,13 +3911,13 @@
noinitrd [RAM] Tells the kernel not to load any configured
initial RAM disk.
- nointremap [X86-64, Intel-IOMMU] Do not enable interrupt
+ nointremap [X86-64,Intel-IOMMU,EARLY] Do not enable interrupt
remapping.
[Deprecated - use intremap=off]
nointroute [IA-64]
- noinvpcid [X86] Disable the INVPCID cpu feature.
+ noinvpcid [X86,EARLY] Disable the INVPCID cpu feature.
noiotrap [SH] Disables trapped I/O port accesses.
@@ -3887,19 +3928,19 @@
nojitter [IA-64] Disables jitter checking for ITC timers.
- nokaslr [KNL]
+ nokaslr [KNL,EARLY]
When CONFIG_RANDOMIZE_BASE is set, this disables
kernel and module base offset ASLR (Address Space
Layout Randomization).
- no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page
+ no-kvmapf [X86,KVM,EARLY] Disable paravirtualized asynchronous page
fault handling.
- no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver
+ no-kvmclock [X86,KVM,EARLY] Disable paravirtualized KVM clock driver
- nolapic [X86-32,APIC] Do not enable or use the local APIC.
+ nolapic [X86-32,APIC,EARLY] Do not enable or use the local APIC.
- nolapic_timer [X86-32,APIC] Do not use the local APIC timer.
+ nolapic_timer [X86-32,APIC,EARLY] Do not use the local APIC timer.
nomca [IA-64] Disable machine check abort handling
@@ -3924,23 +3965,23 @@
shutdown the other cpus. Instead use the REBOOT_VECTOR
irq.
- nopat [X86] Disable PAT (page attribute table extension of
+ nopat [X86,EARLY] Disable PAT (page attribute table extension of
pagetables) support.
- nopcid [X86-64] Disable the PCID cpu feature.
+ nopcid [X86-64,EARLY] Disable the PCID cpu feature.
nopku [X86] Disable Memory Protection Keys CPU feature found
in some Intel CPUs.
- nopti [X86-64]
+ nopti [X86-64,EARLY]
Equivalent to pti=off
- nopv= [X86,XEN,KVM,HYPER_V,VMWARE]
+ nopv= [X86,XEN,KVM,HYPER_V,VMWARE,EARLY]
Disables the PV optimizations forcing the guest to run
as generic guest with no PV drivers. Currently support
XEN HVM, KVM, HYPER_V and VMWARE guest.
- nopvspin [X86,XEN,KVM]
+ nopvspin [X86,XEN,KVM,EARLY]
Disables the qspinlock slow path using PV optimizations
which allow the hypervisor to 'idle' the guest on lock
contention.
@@ -3960,20 +4001,20 @@
This is required for the Braillex ib80-piezo Braille
reader made by F.H. Papenmeier (Germany).
- nosgx [X86-64,SGX] Disables Intel SGX kernel support.
+ nosgx [X86-64,SGX,EARLY] Disables Intel SGX kernel support.
- nosmap [PPC]
+ nosmap [PPC,EARLY]
Disable SMAP (Supervisor Mode Access Prevention)
even if it is supported by processor.
- nosmep [PPC64s]
+ nosmep [PPC64s,EARLY]
Disable SMEP (Supervisor Mode Execution Prevention)
even if it is supported by processor.
- nosmp [SMP] Tells an SMP kernel to act as a UP kernel,
+ nosmp [SMP,EARLY] Tells an SMP kernel to act as a UP kernel,
and disable the IO APIC. legacy for "maxcpus=0".
- nosmt [KNL,MIPS,PPC,S390] Disable symmetric multithreading (SMT).
+ nosmt [KNL,MIPS,PPC,S390,EARLY] Disable symmetric multithreading (SMT).
Equivalent to smt=1.
[KNL,X86,PPC] Disable symmetric multithreading (SMT).
@@ -3983,22 +4024,23 @@
nosoftlockup [KNL] Disable the soft-lockup detector.
nospec_store_bypass_disable
- [HW] Disable all mitigations for the Speculative Store Bypass vulnerability
+ [HW,EARLY] Disable all mitigations for the Speculative
+ Store Bypass vulnerability
- nospectre_bhb [ARM64] Disable all mitigations for Spectre-BHB (branch
+ nospectre_bhb [ARM64,EARLY] Disable all mitigations for Spectre-BHB (branch
history injection) vulnerability. System may allow data leaks
with this option.
- nospectre_v1 [X86,PPC] Disable mitigations for Spectre Variant 1
+ nospectre_v1 [X86,PPC,EARLY] Disable mitigations for Spectre Variant 1
(bounds check bypass). With this option data leaks are
possible in the system.
- nospectre_v2 [X86,PPC_E500,ARM64] Disable all mitigations for
- the Spectre variant 2 (indirect branch prediction)
- vulnerability. System may allow data leaks with this
- option.
+ nospectre_v2 [X86,PPC_E500,ARM64,EARLY] Disable all mitigations
+ for the Spectre variant 2 (indirect branch
+ prediction) vulnerability. System may allow data
+ leaks with this option.
- no-steal-acc [X86,PV_OPS,ARM64,PPC/PSERIES,RISCV] Disable
+ no-steal-acc [X86,PV_OPS,ARM64,PPC/PSERIES,RISCV,EARLY] Disable
paravirtualized steal time accounting. steal time is
computed, but won't influence scheduler behaviour
@@ -4008,7 +4050,7 @@
broken timer IRQ sources.
no_uaccess_flush
- [PPC] Don't flush the L1-D cache after accessing user data.
+ [PPC,EARLY] Don't flush the L1-D cache after accessing user data.
novmcoredd [KNL,KDUMP]
Disable device dump. Device dump allows drivers to
@@ -4022,15 +4064,15 @@
is set.
no-vmw-sched-clock
- [X86,PV_OPS] Disable paravirtualized VMware scheduler
- clock and use the default one.
+ [X86,PV_OPS,EARLY] Disable paravirtualized VMware
+ scheduler clock and use the default one.
nowatchdog [KNL] Disable both lockup detectors, i.e.
soft-lockup and NMI watchdog (hard-lockup).
- nowb [ARM]
+ nowb [ARM,EARLY]
- nox2apic [X86-64,APIC] Do not enable x2APIC mode.
+ nox2apic [X86-64,APIC,EARLY] Do not enable x2APIC mode.
NOTE: this parameter will be ignored on systems with the
LEGACY_XAPIC_DISABLED bit set in the
@@ -4068,7 +4110,7 @@
purges which is reported from either PAL_VM_SUMMARY or
SAL PALO.
- nr_cpus= [SMP] Maximum number of processors that an SMP kernel
+ nr_cpus= [SMP,EARLY] Maximum number of processors that an SMP kernel
could support. nr_cpus=n : n >= 1 limits the kernel to
support 'n' processors. It could be larger than the
number of already plugged CPU during bootup, later in
@@ -4079,8 +4121,9 @@
nr_uarts= [SERIAL] maximum number of UARTs to be registered.
- numa=off [KNL, ARM64, PPC, RISCV, SPARC, X86] Disable NUMA, Only
- set up a single NUMA node spanning all memory.
+ numa=off [KNL, ARM64, PPC, RISCV, SPARC, X86, EARLY]
+ Disable NUMA, Only set up a single NUMA node
+ spanning all memory.
numa_balancing= [KNL,ARM64,PPC,RISCV,S390,X86] Enable or disable automatic
NUMA balancing.
@@ -4091,7 +4134,7 @@
This can be set from sysctl after boot.
See Documentation/admin-guide/sysctl/vm.rst for details.
- ohci1394_dma=early [HW] enable debugging via the ohci1394 driver.
+ ohci1394_dma=early [HW,EARLY] enable debugging via the ohci1394 driver.
See Documentation/core-api/debugging-via-ohci1394.rst for more
info.
@@ -4117,7 +4160,8 @@
Once locked, the boundary cannot be changed.
1 indicates lock status, 0 indicates unlock status.
- oops=panic Always panic on oopses. Default is to just kill the
+ oops=panic [KNL,EARLY]
+ Always panic on oopses. Default is to just kill the
process, but there is a small probability of
deadlocking the machine.
This will also cause panics on machine check exceptions.
@@ -4133,13 +4177,13 @@
can be read from sysfs at:
/sys/module/page_alloc/parameters/shuffle.
- page_owner= [KNL] Boot-time page_owner enabling option.
+ page_owner= [KNL,EARLY] Boot-time page_owner enabling option.
Storage of the information about who allocated
each page is disabled in default. With this switch,
we can turn it on.
on: enable the feature
- page_poison= [KNL] Boot-time parameter changing the state of
+ page_poison= [KNL,EARLY] Boot-time parameter changing the state of
poisoning on the buddy allocator, available with
CONFIG_PAGE_POISONING=y.
off: turn off poisoning (default)
@@ -4157,7 +4201,8 @@
timeout < 0: reboot immediately
Format: <timeout>
- panic_on_taint= Bitmask for conditionally calling panic() in add_taint()
+ panic_on_taint= [KNL,EARLY]
+ Bitmask for conditionally calling panic() in add_taint()
Format: <hex>[,nousertaint]
Hexadecimal bitmask representing the set of TAINT flags
that will cause the kernel to panic when add_taint() is
@@ -4182,6 +4227,7 @@
bit 4: print ftrace buffer
bit 5: print all printk messages in buffer
bit 6: print all CPUs backtrace (if available in the arch)
+ bit 7: print only tasks in uninterruptible (blocked) state
*Be aware* that this option may print a _lot_ of lines,
so there are risks of losing older messages in the log.
Use this option carefully, maybe worth to setup a
@@ -4313,7 +4359,7 @@
pcbit= [HW,ISDN]
- pci=option[,option...] [PCI] various PCI subsystem options.
+ pci=option[,option...] [PCI,EARLY] various PCI subsystem options.
Some options herein operate on a specific device
or a set of devices (<pci_dev>). These are
@@ -4582,7 +4628,8 @@
Format: { 0 | 1 }
See arch/parisc/kernel/pdc_chassis.c
- percpu_alloc= Select which percpu first chunk allocator to use.
+ percpu_alloc= [MM,EARLY]
+ Select which percpu first chunk allocator to use.
Currently supported values are "embed" and "page".
Archs may support subset or none of the selections.
See comments in mm/percpu.c for details on each
@@ -4644,6 +4691,11 @@
may be specified.
Format: <port>,<port>....
+ possible_cpus= [SMP,S390,X86]
+ Format: <unsigned int>
+ Set the number of possible CPUs, overriding the
+ regular discovery mechanisms (such as ACPI/FW, etc).
+
powersave=off [PPC] This option disables power saving features.
It specifically disables cpuidle and sets the
platform machine description specific power_save
@@ -4651,12 +4703,12 @@
execution priority.
ppc_strict_facility_enable
- [PPC] This option catches any kernel floating point,
+ [PPC,ENABLE] This option catches any kernel floating point,
Altivec, VSX and SPE outside of regions specifically
allowed (eg kernel_enable_fpu()/kernel_disable_fpu()).
There is some performance impact when enabling this.
- ppc_tm= [PPC]
+ ppc_tm= [PPC,EARLY]
Format: {"off"}
Disable Hardware Transactional Memory
@@ -4766,7 +4818,7 @@
[KNL] Number of legacy pty's. Overwrites compiled-in
default number.
- quiet [KNL] Disable most log messages
+ quiet [KNL,EARLY] Disable most log messages
r128= [HW,DRM]
@@ -4783,17 +4835,17 @@
ramdisk_start= [RAM] RAM disk image start address
random.trust_cpu=off
- [KNL] Disable trusting the use of the CPU's
+ [KNL,EARLY] Disable trusting the use of the CPU's
random number generator (if available) to
initialize the kernel's RNG.
random.trust_bootloader=off
- [KNL] Disable trusting the use of the a seed
+ [KNL,EARLY] Disable trusting the use of the a seed
passed by the bootloader (if available) to
initialize the kernel's RNG.
randomize_kstack_offset=
- [KNL] Enable or disable kernel stack offset
+ [KNL,EARLY] Enable or disable kernel stack offset
randomization, which provides roughly 5 bits of
entropy, frustrating memory corruption attacks
that depend on stack address determinism or
@@ -5034,6 +5086,11 @@
this kernel boot parameter, forcibly setting it
to zero.
+ rcutree.enable_rcu_lazy= [KNL]
+ To save power, batch RCU callbacks and flush after
+ delay, memory pressure or callback list growing too
+ big.
+
rcuscale.gp_async= [KNL]
Measure performance of asynchronous
grace-period primitives such as call_rcu().
@@ -5484,7 +5541,7 @@
Run specified binary instead of /init from the ramdisk,
used for early userspace startup. See initrd.
- rdrand= [X86]
+ rdrand= [X86,EARLY]
force - Override the decision by the kernel to hide the
advertisement of RDRAND support (this affects
certain AMD processors because of buggy BIOS
@@ -5580,7 +5637,7 @@
them. If <base> is less than 0x10000, the region
is assumed to be I/O ports; otherwise it is memory.
- reservetop= [X86-32]
+ reservetop= [X86-32,EARLY]
Format: nn[KMG]
Reserves a hole at the top of the kernel virtual
address space.
@@ -5665,7 +5722,7 @@
[KNL] Disable ring 3 MONITOR/MWAIT feature on supported
CPUs.
- riscv_isa_fallback [RISCV]
+ riscv_isa_fallback [RISCV,EARLY]
When CONFIG_RISCV_ISA_FALLBACK is not enabled, permit
falling back to detecting extension support by parsing
"riscv,isa" property on devicetree systems when the
@@ -5674,13 +5731,14 @@
ro [KNL] Mount root device read-only on boot
- rodata= [KNL]
+ rodata= [KNL,EARLY]
on Mark read-only kernel memory as read-only (default).
off Leave read-only kernel memory writable for debugging.
full Mark read-only kernel memory and aliases as read-only
[arm64]
rockchip.usb_uart
+ [EARLY]
Enable the uart passthrough on the designated usb port
on Rockchip SoCs. When active, the signals of the
debug-uart get routed to the D+ and D- pins of the usb
@@ -5741,7 +5799,7 @@
sa1100ir [NET]
See drivers/net/irda/sa1100_ir.c.
- sched_verbose [KNL] Enables verbose scheduler debug messages.
+ sched_verbose [KNL,EARLY] Enables verbose scheduler debug messages.
schedstats= [KNL,X86] Enable or disable scheduled statistics.
Allowed values are enable and disable. This feature
@@ -5856,7 +5914,7 @@
non-zero "wait" parameter. See weight_single
and weight_many.
- skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate
+ skew_tick= [KNL,EARLY] Offset the periodic timer tick per cpu to mitigate
xtime_lock contention on larger systems, and/or RCU lock
contention on all systems with CONFIG_MAXSMP set.
Format: { "0" | "1" }
@@ -5895,11 +5953,42 @@
simeth= [IA-64]
simscsi=
- slram= [HW,MTD]
+ slab_debug[=options[,slabs][;[options[,slabs]]...] [MM]
+ Enabling slab_debug allows one to determine the
+ culprit if slab objects become corrupted. Enabling
+ slab_debug can create guard zones around objects and
+ may poison objects when not in use. Also tracks the
+ last alloc / free. For more information see
+ Documentation/mm/slub.rst.
+ (slub_debug legacy name also accepted for now)
+
+ slab_max_order= [MM]
+ Determines the maximum allowed order for slabs.
+ A high setting may cause OOMs due to memory
+ fragmentation. For more information see
+ Documentation/mm/slub.rst.
+ (slub_max_order legacy name also accepted for now)
slab_merge [MM]
Enable merging of slabs with similar size when the
kernel is built without CONFIG_SLAB_MERGE_DEFAULT.
+ (slub_merge legacy name also accepted for now)
+
+ slab_min_objects= [MM]
+ The minimum number of objects per slab. SLUB will
+ increase the slab order up to slab_max_order to
+ generate a sufficiently large slab able to contain
+ the number of objects indicated. The higher the number
+ of objects the smaller the overhead of tracking slabs
+ and the less frequently locks need to be acquired.
+ For more information see Documentation/mm/slub.rst.
+ (slub_min_objects legacy name also accepted for now)
+
+ slab_min_order= [MM]
+ Determines the minimum page order for slabs. Must be
+ lower or equal to slab_max_order. For more information see
+ Documentation/mm/slub.rst.
+ (slub_min_order legacy name also accepted for now)
slab_nomerge [MM]
Disable merging of slabs with similar size. May be
@@ -5913,47 +6002,9 @@
unchanged). Debug options disable merging on their
own.
For more information see Documentation/mm/slub.rst.
+ (slub_nomerge legacy name also accepted for now)
- slab_max_order= [MM, SLAB]
- Determines the maximum allowed order for slabs.
- A high setting may cause OOMs due to memory
- fragmentation. Defaults to 1 for systems with
- more than 32MB of RAM, 0 otherwise.
-
- slub_debug[=options[,slabs][;[options[,slabs]]...] [MM, SLUB]
- Enabling slub_debug allows one to determine the
- culprit if slab objects become corrupted. Enabling
- slub_debug can create guard zones around objects and
- may poison objects when not in use. Also tracks the
- last alloc / free. For more information see
- Documentation/mm/slub.rst.
-
- slub_max_order= [MM, SLUB]
- Determines the maximum allowed order for slabs.
- A high setting may cause OOMs due to memory
- fragmentation. For more information see
- Documentation/mm/slub.rst.
-
- slub_min_objects= [MM, SLUB]
- The minimum number of objects per slab. SLUB will
- increase the slab order up to slub_max_order to
- generate a sufficiently large slab able to contain
- the number of objects indicated. The higher the number
- of objects the smaller the overhead of tracking slabs
- and the less frequently locks need to be acquired.
- For more information see Documentation/mm/slub.rst.
-
- slub_min_order= [MM, SLUB]
- Determines the minimum page order for slabs. Must be
- lower than slub_max_order.
- For more information see Documentation/mm/slub.rst.
-
- slub_merge [MM, SLUB]
- Same with slab_merge.
-
- slub_nomerge [MM, SLUB]
- Same with slab_nomerge. This is supported for legacy.
- See slab_nomerge for more information.
+ slram= [HW,MTD]
smart2= [HW]
Format: <io1>[,<io2>[,...,<io8>]]
@@ -5987,10 +6038,10 @@
1: Fast pin select (default)
2: ATC IRMode
- smt= [KNL,MIPS,S390] Set the maximum number of threads (logical
- CPUs) to use per physical CPU on systems capable of
- symmetric multithreading (SMT). Will be capped to the
- actual hardware limit.
+ smt= [KNL,MIPS,S390,EARLY] Set the maximum number of threads
+ (logical CPUs) to use per physical CPU on systems
+ capable of symmetric multithreading (SMT). Will
+ be capped to the actual hardware limit.
Format: <integer>
Default: -1 (no limit)
@@ -6012,7 +6063,7 @@
sonypi.*= [HW] Sony Programmable I/O Control Device driver
See Documentation/admin-guide/laptops/sonypi.rst
- spectre_v2= [X86] Control mitigation of Spectre variant 2
+ spectre_v2= [X86,EARLY] Control mitigation of Spectre variant 2
(indirect branch speculation) vulnerability.
The default operation protects the kernel from
user space attacks.
@@ -6027,8 +6078,8 @@
Selecting 'on' will, and 'auto' may, choose a
mitigation method at run time according to the
CPU, the available microcode, the setting of the
- CONFIG_RETPOLINE configuration option, and the
- compiler with which the kernel was built.
+ CONFIG_MITIGATION_RETPOLINE configuration option,
+ and the compiler with which the kernel was built.
Selecting 'on' will also enable the mitigation
against user space to user space task attacks.
@@ -6092,7 +6143,7 @@
spectre_v2_user=auto.
spec_rstack_overflow=
- [X86] Control RAS overflow mitigation on AMD Zen CPUs
+ [X86,EARLY] Control RAS overflow mitigation on AMD Zen CPUs
off - Disable mitigation
microcode - Enable microcode mitigation only
@@ -6103,7 +6154,7 @@
(cloud-specific mitigation)
spec_store_bypass_disable=
- [HW] Control Speculative Store Bypass (SSB) Disable mitigation
+ [HW,EARLY] Control Speculative Store Bypass (SSB) Disable mitigation
(Speculative Store Bypass vulnerability)
Certain CPUs are vulnerable to an exploit against a
@@ -6199,7 +6250,7 @@
#DB exception for bus lock is triggered only when
CPL > 0.
- srbds= [X86,INTEL]
+ srbds= [X86,INTEL,EARLY]
Control the Special Register Buffer Data Sampling
(SRBDS) mitigation.
@@ -6286,7 +6337,7 @@
srcutree.convert_to_big must have the 0x10 bit
set for contention-based conversions to occur.
- ssbd= [ARM64,HW]
+ ssbd= [ARM64,HW,EARLY]
Speculative Store Bypass Disable control
On CPUs that are vulnerable to the Speculative
@@ -6310,7 +6361,7 @@
growing up) the main stack are reserved for no other
mapping. Default value is 256 pages.
- stack_depot_disable= [KNL]
+ stack_depot_disable= [KNL,EARLY]
Setting this to true through kernel command line will
disable the stack depot thereby saving the static memory
consumed by the stack hash table. By default this is set
@@ -6349,12 +6400,12 @@
be used to filter out binaries which have
not yet been made aware of AT_MINSIGSTKSZ.
- stress_hpt [PPC]
+ stress_hpt [PPC,EARLY]
Limits the number of kernel HPT entries in the hash
page table to increase the rate of hash page table
faults on kernel addresses.
- stress_slb [PPC]
+ stress_slb [PPC,EARLY]
Limits the number of kernel SLB entries, and flushes
them frequently to increase the rate of SLB faults
on kernel addresses.
@@ -6414,7 +6465,7 @@
This parameter controls use of the Protected
Execution Facility on pSeries.
- swiotlb= [ARM,IA-64,PPC,MIPS,X86]
+ swiotlb= [ARM,IA-64,PPC,MIPS,X86,EARLY]
Format: { <int> [,<int>] | force | noforce }
<int> -- Number of I/O TLB slabs
<int> -- Second integer after comma. Number of swiotlb
@@ -6424,7 +6475,7 @@
wouldn't be automatically used by the kernel
noforce -- Never use bounce buffers (for debugging)
- switches= [HW,M68k]
+ switches= [HW,M68k,EARLY]
sysctl.*= [KNL]
Set a sysctl parameter, right before loading the init
@@ -6483,11 +6534,11 @@
<deci-seconds>: poll all this frequency
0: no polling (default)
- threadirqs [KNL]
+ threadirqs [KNL,EARLY]
Force threading of all interrupt handlers except those
marked explicitly IRQF_NO_THREAD.
- topology= [S390]
+ topology= [S390,EARLY]
Format: {off | on}
Specify if the kernel should make use of the cpu
topology information if the hardware supports this.
@@ -6548,7 +6599,7 @@
To turn off having tracepoints sent to printk,
echo 0 > /proc/sys/kernel/tracepoint_printk
Note, echoing 1 into this file without the
- tracepoint_printk kernel cmdline option has no effect.
+ tp_printk kernel cmdline option has no effect.
The tp_printk_stop_on_boot (see below) can also be used
to stop the printing of events to console at
@@ -6728,7 +6779,7 @@
can be overridden by a later tsc=nowatchdog. A console
message will flag any such suppression or overriding.
- tsc_early_khz= [X86] Skip early TSC calibration and use the given
+ tsc_early_khz= [X86,EARLY] Skip early TSC calibration and use the given
value instead. Useful when the early TSC frequency discovery
procedure is not reliable, such as on overclocked systems
with CPUID.16h support and partial CPUID.15h support.
@@ -6763,7 +6814,7 @@
See Documentation/admin-guide/hw-vuln/tsx_async_abort.rst
for more details.
- tsx_async_abort= [X86,INTEL] Control mitigation for the TSX Async
+ tsx_async_abort= [X86,INTEL,EARLY] Control mitigation for the TSX Async
Abort (TAA) vulnerability.
Similar to Micro-architectural Data Sampling (MDS)
@@ -6829,7 +6880,7 @@
unknown_nmi_panic
[X86] Cause panic on unknown NMI.
- unwind_debug [X86-64]
+ unwind_debug [X86-64,EARLY]
Enable unwinder debug output. This can be
useful for debugging certain unwinder error
conditions, including corrupt stacks and
@@ -7019,7 +7070,7 @@
Example: user_debug=31
userpte=
- [X86] Flags controlling user PTE allocations.
+ [X86,EARLY] Flags controlling user PTE allocations.
nohigh = do not allocate PTE pages in
HIGHMEM regardless of setting
@@ -7048,7 +7099,7 @@
vector= [IA-64,SMP]
vector=percpu: enable percpu vector domain
- video= [FB] Frame buffer configuration
+ video= [FB,EARLY] Frame buffer configuration
See Documentation/fb/modedb.rst.
video.brightness_switch_enabled= [ACPI]
@@ -7096,13 +7147,13 @@
P Enable page structure init time poisoning
- Disable all of the above options
- vmalloc=nn[KMG] [KNL,BOOT] Forces the vmalloc area to have an exact
- size of <nn>. This can be used to increase the
- minimum size (128MB on x86). It can also be used to
- decrease the size and leave more room for directly
- mapped kernel RAM.
+ vmalloc=nn[KMG] [KNL,BOOT,EARLY] Forces the vmalloc area to have an
+ exact size of <nn>. This can be used to increase
+ the minimum size (128MB on x86). It can also be
+ used to decrease the size and leave more room
+ for directly mapped kernel RAM.
- vmcp_cma=nn[MG] [KNL,S390]
+ vmcp_cma=nn[MG] [KNL,S390,EARLY]
Sets the memory size reserved for contiguous memory
allocations for the vmcp device driver.
@@ -7115,7 +7166,7 @@
vmpoff= [KNL,S390] Perform z/VM CP command after power off.
Format: <command>
- vsyscall= [X86-64]
+ vsyscall= [X86-64,EARLY]
Controls the behavior of vsyscalls (i.e. calls to
fixed addresses of 0xffffffffff600x00 from legacy
code). Most statically-linked binaries and older
@@ -7225,6 +7276,15 @@
threshold repeatedly. They are likely good
candidates for using WQ_UNBOUND workqueues instead.
+ workqueue.cpu_intensive_warning_thresh=<uint>
+ If CONFIG_WQ_CPU_INTENSIVE_REPORT is set, the kernel
+ will report the work functions which violate the
+ intensive_threshold_us repeatedly. In order to prevent
+ spurious warnings, start printing only after a work
+ function has violated this threshold number of times.
+
+ The default is 4 times. 0 disables the warning.
+
workqueue.power_efficient
Per-cpu workqueues are generally preferred because
they show better performance thanks to cache
@@ -7263,13 +7323,13 @@
When enabled, memory and cache locality will be
impacted.
- writecombine= [LOONGARCH] Control the MAT (Memory Access Type) of
- ioremap_wc().
+ writecombine= [LOONGARCH,EARLY] Control the MAT (Memory Access
+ Type) of ioremap_wc().
on - Enable writecombine, use WUC for ioremap_wc()
off - Disable writecombine, use SUC for ioremap_wc()
- x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of
+ x2apic_phys [X86-64,APIC,EARLY] Use x2apic physical mode instead of
default x2apic cluster mode on platforms
supporting x2apic.
@@ -7280,7 +7340,7 @@
save/restore/migration must be enabled to handle larger
domains.
- xen_emul_unplug= [HW,X86,XEN]
+ xen_emul_unplug= [HW,X86,XEN,EARLY]
Unplug Xen emulated devices
Format: [unplug0,][unplug1]
ide-disks -- unplug primary master IDE devices
@@ -7292,17 +7352,17 @@
the unplug protocol
never -- do not unplug even if version check succeeds
- xen_legacy_crash [X86,XEN]
+ xen_legacy_crash [X86,XEN,EARLY]
Crash from Xen panic notifier, without executing late
panic() code such as dumping handler.
- xen_msr_safe= [X86,XEN]
+ xen_msr_safe= [X86,XEN,EARLY]
Format: <bool>
Select whether to always use non-faulting (safe) MSR
access functions when running as Xen PV guest. The
default value is controlled by CONFIG_XEN_PV_MSR_SAFE.
- xen_nopvspin [X86,XEN]
+ xen_nopvspin [X86,XEN,EARLY]
Disables the qspinlock slowpath using Xen PV optimizations.
This parameter is obsoleted by "nopvspin" parameter, which
has equivalent effect for XEN platform.
@@ -7314,7 +7374,7 @@
has equivalent effect for XEN platform.
xen_no_vector_callback
- [KNL,X86,XEN] Disable the vector callback for Xen
+ [KNL,X86,XEN,EARLY] Disable the vector callback for Xen
event channel interrupts.
xen_scrub_pages= [XEN]
@@ -7323,7 +7383,7 @@
with /sys/devices/system/xen_memory/xen_memory0/scrub_pages.
Default value controlled with CONFIG_XEN_SCRUB_PAGES_DEFAULT.
- xen_timer_slop= [X86-64,XEN]
+ xen_timer_slop= [X86-64,XEN,EARLY]
Set the timer slop (in nanoseconds) for the virtual Xen
timers (default is 100000). This adjusts the minimum
delta of virtualized Xen timers, where lower values
@@ -7376,7 +7436,7 @@
host controller quirks. Meaning of each bit can be
consulted in header drivers/usb/host/xhci.h.
- xmon [PPC]
+ xmon [PPC,EARLY]
Format: { early | on | rw | ro | off }
Controls if xmon debugger is enabled. Default is off.
Passing only "xmon" is equivalent to "xmon=early".
diff --git a/Documentation/admin-guide/laptops/thinkpad-acpi.rst b/Documentation/admin-guide/laptops/thinkpad-acpi.rst
index 98d3040..7f674a6 100644
--- a/Documentation/admin-guide/laptops/thinkpad-acpi.rst
+++ b/Documentation/admin-guide/laptops/thinkpad-acpi.rst
@@ -444,7 +444,9 @@
0x1008 0x07 FN+F8 IBM: toggle screen expand
Lenovo: configure UltraNav,
- or toggle screen expand
+ or toggle screen expand.
+ On newer platforms (2024+)
+ replaced by 0x131f (see below)
0x1009 0x08 FN+F9 -
@@ -504,6 +506,9 @@
0x1019 0x18 unknown
+0x131f ... FN+F8 Platform Mode change.
+ Implemented in driver.
+
... ... ...
0x1020 0x1F unknown
diff --git a/Documentation/admin-guide/media/visl.rst b/Documentation/admin-guide/media/visl.rst
index db1ef29..cd45145 100644
--- a/Documentation/admin-guide/media/visl.rst
+++ b/Documentation/admin-guide/media/visl.rst
@@ -49,6 +49,10 @@
visl_dprintk_frame_start, visl_dprintk_nframes, but controls the dumping of
buffer data through debugfs instead.
+- tpg_verbose: Write extra information on each output frame to ease debugging
+ the API. When set to true, the output frames are not stable for a given input
+ as some information like pointers or queue status will be added to them.
+
What is the default use case for this driver?
---------------------------------------------
@@ -57,8 +61,12 @@
OUTPUT buffer data is subsequently used to debug a work-in-progress
implementation.
-Information on reference frames, their timestamps, the status of the OUTPUT and
-CAPTURE queues and more can be read directly from the CAPTURE buffers.
+Even though no video decoding is actually done, the output frames can be used
+against a reference for a given input, except if tpg_verbose is set to true.
+
+Depending on the tpg_verbose parameter value, information on reference frames,
+their timestamps, the status of the OUTPUT and CAPTURE queues and more can be
+read directly from the CAPTURE buffers.
Supported codecs
----------------
diff --git a/Documentation/admin-guide/media/vivid.rst b/Documentation/admin-guide/media/vivid.rst
index 58ac25b..b6f658c 100644
--- a/Documentation/admin-guide/media/vivid.rst
+++ b/Documentation/admin-guide/media/vivid.rst
@@ -60,7 +60,7 @@
- node_types:
which devices should each driver instance create. An array of
- hexadecimal values, one for each instance. The default is 0x1d3d.
+ hexadecimal values, one for each instance. The default is 0xe1d3d.
Each value is a bitmask with the following meaning:
- bit 0: Video Capture node
diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst
index 343e25b..af05ae6 100644
--- a/Documentation/admin-guide/mm/damon/reclaim.rst
+++ b/Documentation/admin-guide/mm/damon/reclaim.rst
@@ -117,6 +117,33 @@
1 second by default.
+quota_mem_pressure_us
+---------------------
+
+Desired level of memory pressure-stall time in microseconds.
+
+While keeping the caps that set by other quotas, DAMON_RECLAIM automatically
+increases and decreases the effective level of the quota aiming this level of
+memory pressure is incurred. System-wide ``some`` memory PSI in microseconds
+per quota reset interval (``quota_reset_interval_ms``) is collected and
+compared to this value to see if the aim is satisfied. Value zero means
+disabling this auto-tuning feature.
+
+Disabled by default.
+
+quota_autotune_feedback
+-----------------------
+
+User-specifiable feedback for auto-tuning of the effective quota.
+
+While keeping the caps that set by other quotas, DAMON_RECLAIM automatically
+increases and decreases the effective level of the quota aiming receiving this
+feedback of value ``10,000`` from the user. DAMON_RECLAIM assumes the feedback
+value and the quota are positively proportional. Value zero means disabling
+this auto-tuning feature.
+
+Disabled by default.
+
wmarks_interval
---------------
diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst
index 9d23144..6fce035 100644
--- a/Documentation/admin-guide/mm/damon/usage.rst
+++ b/Documentation/admin-guide/mm/damon/usage.rst
@@ -83,10 +83,10 @@
│ │ │ │ │ │ │ │ sz/min,max
│ │ │ │ │ │ │ │ nr_accesses/min,max
│ │ │ │ │ │ │ │ age/min,max
- │ │ │ │ │ │ │ :ref:`quotas <sysfs_quotas>`/ms,bytes,reset_interval_ms
+ │ │ │ │ │ │ │ :ref:`quotas <sysfs_quotas>`/ms,bytes,reset_interval_ms,effective_bytes
│ │ │ │ │ │ │ │ weights/sz_permil,nr_accesses_permil,age_permil
│ │ │ │ │ │ │ │ :ref:`goals <sysfs_schemes_quota_goals>`/nr_goals
- │ │ │ │ │ │ │ │ │ 0/target_value,current_value
+ │ │ │ │ │ │ │ │ │ 0/target_metric,target_value,current_value
│ │ │ │ │ │ │ :ref:`watermarks <sysfs_watermarks>`/metric,interval_us,high,mid,low
│ │ │ │ │ │ │ :ref:`filters <sysfs_filters>`/nr_filters
│ │ │ │ │ │ │ │ 0/type,matching,memcg_id
@@ -153,6 +153,9 @@
- ``clear_schemes_tried_regions``: Clear the DAMON-based operating scheme
action tried regions directory for each DAMON-based operation scheme of the
kdamond.
+- ``update_schemes_effective_bytes``: Update the contents of
+ ``effective_bytes`` files for each DAMON-based operation scheme of the
+ kdamond. For more details, refer to :ref:`quotas directory <sysfs_quotas>`.
If the state is ``on``, reading ``pid`` shows the pid of the kdamond thread.
@@ -180,19 +183,14 @@
and three directories (``monitoring_attrs``, ``targets``, and ``schemes``)
exist.
-DAMON supports multiple types of monitoring operations, including those for
-virtual address space and the physical address space. You can get the list of
-available monitoring operations set on the currently running kernel by reading
+DAMON supports multiple types of :ref:`monitoring operations
+<damon_design_configurable_operations_set>`, including those for virtual address
+space and the physical address space. You can get the list of available
+monitoring operations set on the currently running kernel by reading
``avail_operations`` file. Based on the kernel configuration, the file will
-list some or all of below keywords.
-
- - vaddr: Monitor virtual address spaces of specific processes
- - fvaddr: Monitor fixed virtual address ranges
- - paddr: Monitor the physical address space of the system
-
-Please refer to :ref:`regions sysfs directory <sysfs_regions>` for detailed
-differences between the operations sets in terms of the monitoring target
-regions.
+list different available operation sets. Please refer to the :ref:`design
+<damon_operations_set>` for the list of all available operation sets and their
+brief explanations.
You can set and get what type of monitoring operations DAMON will use for the
context by writing one of the keywords listed in ``avail_operations`` file and
@@ -247,17 +245,11 @@
targets/<N>/regions
-------------------
-When ``vaddr`` monitoring operations set is being used (``vaddr`` is written to
-the ``contexts/<N>/operations`` file), DAMON automatically sets and updates the
-monitoring target regions so that entire memory mappings of target processes
-can be covered. However, users could want to set the initial monitoring region
-to specific address ranges.
-
-In contrast, DAMON do not automatically sets and updates the monitoring target
-regions when ``fvaddr`` or ``paddr`` monitoring operations sets are being used
-(``fvaddr`` or ``paddr`` have written to the ``contexts/<N>/operations``).
-Therefore, users should set the monitoring target regions by themselves in the
-cases.
+In case of ``fvaddr`` or ``paddr`` monitoring operations sets, users are
+required to set the monitoring target address ranges. In case of ``vaddr``
+operations set, it is not mandatory, but users can optionally set the initial
+monitoring region to specific address ranges. Please refer to the :ref:`design
+<damon_design_vaddr_target_regions_construction>` for more details.
For such cases, users can explicitly set the initial monitoring target regions
as they want, by writing proper values to the files under this directory.
@@ -302,27 +294,8 @@
The ``action`` file is for setting and getting the scheme's :ref:`action
<damon_design_damos_action>`. The keywords that can be written to and read
-from the file and their meaning are as below.
-
-Note that support of each action depends on the running DAMON operations set
-:ref:`implementation <sysfs_context>`.
-
- - ``willneed``: Call ``madvise()`` for the region with ``MADV_WILLNEED``.
- Supported by ``vaddr`` and ``fvaddr`` operations set.
- - ``cold``: Call ``madvise()`` for the region with ``MADV_COLD``.
- Supported by ``vaddr`` and ``fvaddr`` operations set.
- - ``pageout``: Call ``madvise()`` for the region with ``MADV_PAGEOUT``.
- Supported by ``vaddr``, ``fvaddr`` and ``paddr`` operations set.
- - ``hugepage``: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``.
- Supported by ``vaddr`` and ``fvaddr`` operations set.
- - ``nohugepage``: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``.
- Supported by ``vaddr`` and ``fvaddr`` operations set.
- - ``lru_prio``: Prioritize the region on its LRU lists.
- Supported by ``paddr`` operations set.
- - ``lru_deprio``: Deprioritize the region on its LRU lists.
- Supported by ``paddr`` operations set.
- - ``stat``: Do nothing but count the statistics.
- Supported by all operations sets.
+from the file and their meaning are same to those of the list on
+:ref:`design doc <damon_design_damos_action>`.
The ``apply_interval_us`` file is for setting and getting the scheme's
:ref:`apply_interval <damon_design_damos>` in microseconds.
@@ -350,8 +323,9 @@
The directory for the :ref:`quotas <damon_design_damos_quotas>` of the given
DAMON-based operation scheme.
-Under ``quotas`` directory, three files (``ms``, ``bytes``,
-``reset_interval_ms``) and two directores (``weights`` and ``goals``) exist.
+Under ``quotas`` directory, four files (``ms``, ``bytes``,
+``reset_interval_ms``, ``effective_bytes``) and two directores (``weights`` and
+``goals``) exist.
You can set the ``time quota`` in milliseconds, ``size quota`` in bytes, and
``reset interval`` in milliseconds by writing the values to the three files,
@@ -359,7 +333,17 @@
for applying the ``action`` to memory regions of the ``access_pattern``, and to
apply the action to only up to ``bytes`` bytes of memory regions within the
``reset_interval_ms``. Setting both ``ms`` and ``bytes`` zero disables the
-quota limits.
+quota limits unless at least one :ref:`goal <sysfs_schemes_quota_goals>` is
+set.
+
+The time quota is internally transformed to a size quota. Between the
+transformed size quota and user-specified size quota, smaller one is applied.
+Based on the user-specified :ref:`goal <sysfs_schemes_quota_goals>`, the
+effective size quota is further adjusted. Reading ``effective_bytes`` returns
+the current effective size quota. The file is not updated in real time, so
+users should ask DAMON sysfs interface to update the content of the file for
+the stats by writing a special keyword, ``update_schemes_effective_bytes`` to
+the relevant ``kdamonds/<N>/state`` file.
Under ``weights`` directory, three files (``sz_permil``,
``nr_accesses_permil``, and ``age_permil``) exist.
@@ -382,11 +366,11 @@
to ``N-1``. Each directory represents each goal and current achievement.
Among the multiple feedback, the best one is used.
-Each goal directory contains two files, namely ``target_value`` and
-``current_value``. Users can set and get any number to those files to set the
-feedback. User space main workload's latency or throughput, system metrics
-like free memory ratio or memory pressure stall time (PSI) could be example
-metrics for the values. Note that users should write
+Each goal directory contains three files, namely ``target_metric``,
+``target_value`` and ``current_value``. Users can set and get the three
+parameters for the quota auto-tuning goals that specified on the :ref:`design
+doc <damon_design_damos_quotas_auto_tuning>` by writing to and reading from each
+of the files. Note that users should further write
``commit_schemes_quota_goals`` to the ``state`` file of the :ref:`kdamond
directory <sysfs_kdamond>` to pass the feedback to DAMON.
@@ -579,11 +563,11 @@
While the monitoring is turned on, you could record the tracepoint events and
show results using tracepoint supporting tools like ``perf``. For example::
- # echo on > monitor_on
+ # echo on > kdamonds/0/state
# perf record -e damon:damon_aggregated &
# sleep 5
# kill 9 $(pidof perf)
- # echo off > monitor_on
+ # echo off > kdamonds/0/state
# perf script
kdamond.0 46568 [027] 79357.842179: damon:damon_aggregated: target_id=0 nr_regions=11 122509119488-135708762112: 0 864
[...]
@@ -628,9 +612,17 @@
move, please report your usecase to damon@lists.linux.dev and
linux-mm@kvack.org.
-DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``,
-``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` and
-``rm_contexts`` under its debugfs directory, ``<debugfs>/damon/``.
+DAMON exports nine files, ``DEPRECATED``, ``attrs``, ``target_ids``,
+``init_regions``, ``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``,
+``mk_contexts`` and ``rm_contexts`` under its debugfs directory,
+``<debugfs>/damon/``.
+
+
+``DEPRECATED`` is a read-only file for the DAMON debugfs interface deprecation
+notice. Reading it returns the deprecation notice, as below::
+
+ # cat DEPRECATED
+ DAMON debugfs interface is deprecated, so users should move to DAMON_SYSFS. If you cannot, please report your usecase to damon@lists.linux.dev and linux-mm@kvack.org.
Attributes
@@ -755,19 +747,17 @@
~~~~~~
The ``<action>`` is a predefined integer for memory management :ref:`actions
-<damon_design_damos_action>`. The supported numbers and their meanings are as
-below.
+<damon_design_damos_action>`. The mapping between the ``<action>`` values and
+the memory management actions is as below. For the detailed meaning of the
+action and DAMON operations set supporting each action, please refer to the
+list on :ref:`design doc <damon_design_damos_action>`.
- - 0: Call ``madvise()`` for the region with ``MADV_WILLNEED``. Ignored if
- ``target`` is ``paddr``.
- - 1: Call ``madvise()`` for the region with ``MADV_COLD``. Ignored if
- ``target`` is ``paddr``.
- - 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT``.
- - 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``. Ignored if
- ``target`` is ``paddr``.
- - 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``. Ignored if
- ``target`` is ``paddr``.
- - 5: Do nothing but count the statistics
+ - 0: ``willneed``
+ - 1: ``cold``
+ - 2: ``pageout``
+ - 3: ``hugepage``
+ - 4: ``nohugepage``
+ - 5: ``stat``
Quota
~~~~~
@@ -848,16 +838,16 @@
Setting the files as described above doesn't incur effect unless you explicitly
start the monitoring. You can start, stop, and check the current status of the
-monitoring by writing to and reading from the ``monitor_on`` file. Writing
-``on`` to the file starts the monitoring of the targets with the attributes.
-Writing ``off`` to the file stops those. DAMON also stops if every target
-process is terminated. Below example commands turn on, off, and check the
-status of DAMON::
+monitoring by writing to and reading from the ``monitor_on_DEPRECATED`` file.
+Writing ``on`` to the file starts the monitoring of the targets with the
+attributes. Writing ``off`` to the file stops those. DAMON also stops if
+every target process is terminated. Below example commands turn on, off, and
+check the status of DAMON::
# cd <debugfs>/damon
- # echo on > monitor_on
- # echo off > monitor_on
- # cat monitor_on
+ # echo on > monitor_on_DEPRECATED
+ # echo off > monitor_on_DEPRECATED
+ # cat monitor_on_DEPRECATED
off
Please note that you cannot write to the above-mentioned debugfs files while
@@ -873,11 +863,11 @@
monitoring is turned off, reading the file returns ``none``. ::
# cd <debugfs>/damon
- # cat monitor_on
+ # cat monitor_on_DEPRECATED
off
# cat kdamond_pid
none
- # echo on > monitor_on
+ # echo on > monitor_on_DEPRECATED
# cat kdamond_pid
18594
@@ -907,5 +897,5 @@
# ls foo
# ls: cannot access 'foo': No such file or directory
-Note that ``mk_contexts``, ``rm_contexts``, and ``monitor_on`` files are in the
-root directory only.
+Note that ``mk_contexts``, ``rm_contexts``, and ``monitor_on_DEPRECATED`` files
+are in the root directory only.
diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst
index eca38fa..a70f20c 100644
--- a/Documentation/admin-guide/mm/numa_memory_policy.rst
+++ b/Documentation/admin-guide/mm/numa_memory_policy.rst
@@ -250,6 +250,15 @@
can fall back to all existing numa nodes. This is effectively
MPOL_PREFERRED allowed for a mask rather than a single node.
+MPOL_WEIGHTED_INTERLEAVE
+ This mode operates the same as MPOL_INTERLEAVE, except that
+ interleaving behavior is executed based on weights set in
+ /sys/kernel/mm/mempolicy/weighted_interleave/
+
+ Weighted interleave allocates pages on nodes according to a
+ weight. For example if nodes [0,1] are weighted [5,2], 5 pages
+ will be allocated on node0 for every 2 pages allocated on node1.
+
NUMA memory policy supports the following optional mode flags:
MPOL_F_STATIC_NODES
diff --git a/Documentation/admin-guide/mm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst
index b4213296..1363267 100644
--- a/Documentation/admin-guide/mm/zswap.rst
+++ b/Documentation/admin-guide/mm/zswap.rst
@@ -155,7 +155,7 @@
Some users cannot tolerate the swapping that comes with zswap store failures
and zswap writebacks. Swapping can be disabled entirely (without disabling
-zswap itself) on a cgroup-basis as follows:
+zswap itself) on a cgroup-basis as follows::
echo 0 > /sys/fs/cgroup/<cgroup-name>/memory.zswap.writeback
@@ -166,7 +166,7 @@
When there is a sizable amount of cold memory residing in the zswap pool, it
can be advantageous to proactively write these cold pages to swap and reclaim
the memory for other use cases. By default, the zswap shrinker is disabled.
-User can enable it as follows:
+User can enable it as follows::
echo Y > /sys/module/zswap/parameters/shrinker_enabled
diff --git a/Documentation/admin-guide/perf/hisi-pcie-pmu.rst b/Documentation/admin-guide/perf/hisi-pcie-pmu.rst
index 7e86366..5541ff4 100644
--- a/Documentation/admin-guide/perf/hisi-pcie-pmu.rst
+++ b/Documentation/admin-guide/perf/hisi-pcie-pmu.rst
@@ -37,9 +37,21 @@
hisi_pcie0_core0/rx_mwr_cnt/ [kernel PMU event]
------------------------------------------
- $# perf stat -e hisi_pcie0_core0/rx_mwr_latency/
- $# perf stat -e hisi_pcie0_core0/rx_mwr_cnt/
- $# perf stat -g -e hisi_pcie0_core0/rx_mwr_latency/ -e hisi_pcie0_core0/rx_mwr_cnt/
+ $# perf stat -e hisi_pcie0_core0/rx_mwr_latency,port=0xffff/
+ $# perf stat -e hisi_pcie0_core0/rx_mwr_cnt,port=0xffff/
+
+The related events usually used to calculate the bandwidth, latency or others.
+They need to start and end counting at the same time, therefore related events
+are best used in the same event group to get the expected value. There are two
+ways to know if they are related events:
+
+a) By event name, such as the latency events "xxx_latency, xxx_cnt" or
+ bandwidth events "xxx_flux, xxx_time".
+b) By event type, such as "event=0xXXXX, event=0x1XXXX".
+
+Example usage of perf group::
+
+ $# perf stat -e "{hisi_pcie0_core0/rx_mwr_latency,port=0xffff/,hisi_pcie0_core0/rx_mwr_cnt,port=0xffff/}"
The current driver does not support sampling. So "perf record" is unsupported.
Also attach to a task is unsupported for PCIe PMU.
@@ -51,8 +63,12 @@
PMU could only monitor the performance of traffic downstream target Root
Ports or downstream target Endpoint. PCIe PMU driver support "port" and
- "bdf" interfaces for users, and these two interfaces aren't supported at the
- same time.
+ "bdf" interfaces for users.
+ Please notice that, one of these two interfaces must be set, and these two
+ interfaces aren't supported at the same time. If they are both set, only
+ "port" filter is valid.
+ If "port" filter not being set or is set explicitly to zero (default), the
+ "bdf" filter will be in effect, because "bdf=0" meaning 0000:000:00.0.
- port
@@ -95,7 +111,7 @@
Example usage of perf::
- $# perf stat -e hisi_pcie0_core0/rx_mrd_flux,trig_len=0x4,trig_mode=1/ sleep 5
+ $# perf stat -e hisi_pcie0_core0/rx_mrd_flux,port=0xffff,trig_len=0x4,trig_mode=1/ sleep 5
3. Threshold filter
@@ -109,7 +125,7 @@
Example usage of perf::
- $# perf stat -e hisi_pcie0_core0/rx_mrd_flux,thr_len=0x4,thr_mode=1/ sleep 5
+ $# perf stat -e hisi_pcie0_core0/rx_mrd_flux,port=0xffff,thr_len=0x4,thr_mode=1/ sleep 5
4. TLP Length filter
@@ -127,4 +143,4 @@
Example usage of perf::
- $# perf stat -e hisi_pcie0_core0/rx_mrd_flux,len_mode=0x1/ sleep 5
+ $# perf stat -e hisi_pcie0_core0/rx_mrd_flux,port=0xffff,len_mode=0x1/ sleep 5
diff --git a/Documentation/admin-guide/perf/index.rst b/Documentation/admin-guide/perf/index.rst
index f4a4513..7eb3dcd 100644
--- a/Documentation/admin-guide/perf/index.rst
+++ b/Documentation/admin-guide/perf/index.rst
@@ -13,6 +13,7 @@
imx-ddr
qcom_l2_pmu
qcom_l3_pmu
+ starfive_starlink_pmu
arm-ccn
arm-cmn
xgene-pmu
diff --git a/Documentation/admin-guide/perf/starfive_starlink_pmu.rst b/Documentation/admin-guide/perf/starfive_starlink_pmu.rst
new file mode 100644
index 0000000..2932ddb
--- /dev/null
+++ b/Documentation/admin-guide/perf/starfive_starlink_pmu.rst
@@ -0,0 +1,46 @@
+================================================
+StarFive StarLink Performance Monitor Unit (PMU)
+================================================
+
+StarFive StarLink Performance Monitor Unit (PMU) exists within the
+StarLink Coherent Network on Chip (CNoC) that connects multiple CPU
+clusters with an L3 memory system.
+
+The uncore PMU supports overflow interrupt, up to 16 programmable 64bit
+event counters, and an independent 64bit cycle counter.
+The PMU can only be accessed via Memory Mapped I/O and are common to the
+cores connected to the same PMU.
+
+Driver exposes supported PMU events in sysfs "events" directory under::
+
+ /sys/bus/event_source/devices/starfive_starlink_pmu/events/
+
+Driver exposes cpu used to handle PMU events in sysfs "cpumask" directory
+under::
+
+ /sys/bus/event_source/devices/starfive_starlink_pmu/cpumask/
+
+Driver describes the format of config (event ID) in sysfs "format" directory
+under::
+
+ /sys/bus/event_source/devices/starfive_starlink_pmu/format/
+
+Example of perf usage::
+
+ $ perf list
+
+ starfive_starlink_pmu/cycles/ [Kernel PMU event]
+ starfive_starlink_pmu/read_hit/ [Kernel PMU event]
+ starfive_starlink_pmu/read_miss/ [Kernel PMU event]
+ starfive_starlink_pmu/read_request/ [Kernel PMU event]
+ starfive_starlink_pmu/release_request/ [Kernel PMU event]
+ starfive_starlink_pmu/write_hit/ [Kernel PMU event]
+ starfive_starlink_pmu/write_miss/ [Kernel PMU event]
+ starfive_starlink_pmu/write_request/ [Kernel PMU event]
+ starfive_starlink_pmu/writeback/ [Kernel PMU event]
+
+
+ $ perf stat -a -e /starfive_starlink_pmu/cycles/ sleep 1
+
+Sampling is not supported. As a result, "perf record" is not supported.
+Attaching to a task is not supported, only system-wide counting is supported.
diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst
index 9eb2601..1e0d101 100644
--- a/Documentation/admin-guide/pm/amd-pstate.rst
+++ b/Documentation/admin-guide/pm/amd-pstate.rst
@@ -300,8 +300,8 @@
efficiency frequency management method on AMD processors.
-AMD Pstate Driver Operation Modes
-=================================
+``amd-pstate`` Driver Operation Modes
+======================================
``amd_pstate`` CPPC has 3 operation modes: autonomous (active) mode,
non-autonomous (passive) mode and guided autonomous (guided) mode.
@@ -353,6 +353,48 @@
level and the platform autonomously selects a performance level in this range
and appropriate to the current workload.
+``amd-pstate`` Preferred Core
+=================================
+
+The core frequency is subjected to the process variation in semiconductors.
+Not all cores are able to reach the maximum frequency respecting the
+infrastructure limits. Consequently, AMD has redefined the concept of
+maximum frequency of a part. This means that a fraction of cores can reach
+maximum frequency. To find the best process scheduling policy for a given
+scenario, OS needs to know the core ordering informed by the platform through
+highest performance capability register of the CPPC interface.
+
+``amd-pstate`` preferred core enables the scheduler to prefer scheduling on
+cores that can achieve a higher frequency with lower voltage. The preferred
+core rankings can dynamically change based on the workload, platform conditions,
+thermals and ageing.
+
+The priority metric will be initialized by the ``amd-pstate`` driver. The ``amd-pstate``
+driver will also determine whether or not ``amd-pstate`` preferred core is
+supported by the platform.
+
+``amd-pstate`` driver will provide an initial core ordering when the system boots.
+The platform uses the CPPC interfaces to communicate the core ranking to the
+operating system and scheduler to make sure that OS is choosing the cores
+with highest performance firstly for scheduling the process. When ``amd-pstate``
+driver receives a message with the highest performance change, it will
+update the core ranking and set the cpu's priority.
+
+``amd-pstate`` Preferred Core Switch
+=====================================
+Kernel Parameters
+-----------------
+
+``amd-pstate`` peferred core`` has two states: enable and disable.
+Enable/disable states can be chosen by different kernel parameters.
+Default enable ``amd-pstate`` preferred core.
+
+``amd_prefcore=disable``
+
+For systems that support ``amd-pstate`` preferred core, the core rankings will
+always be advertised by the platform. But OS can choose to ignore that via the
+kernel parameter ``amd_prefcore=disable``.
+
User Space Interface in ``sysfs`` - General
===========================================
@@ -385,6 +427,19 @@
to the operation mode represented by that string - or to be
unregistered in the "disable" case.
+``prefcore``
+ Preferred core state of the driver: "enabled" or "disabled".
+
+ "enabled"
+ Enable the ``amd-pstate`` preferred core.
+
+ "disabled"
+ Disable the ``amd-pstate`` preferred core
+
+
+ This attribute is read-only to check the state of preferred core set
+ by the kernel parameter.
+
``cpupower`` tool support for ``amd-pstate``
===============================================
diff --git a/Documentation/admin-guide/ras.rst b/Documentation/admin-guide/ras.rst
deleted file mode 100644
index 8e03751..0000000
--- a/Documentation/admin-guide/ras.rst
+++ /dev/null
@@ -1,1219 +0,0 @@
-.. include:: <isonum.txt>
-
-============================================
-Reliability, Availability and Serviceability
-============================================
-
-RAS concepts
-************
-
-Reliability, Availability and Serviceability (RAS) is a concept used on
-servers meant to measure their robustness.
-
-Reliability
- is the probability that a system will produce correct outputs.
-
- * Generally measured as Mean Time Between Failures (MTBF)
- * Enhanced by features that help to avoid, detect and repair hardware faults
-
-Availability
- is the probability that a system is operational at a given time
-
- * Generally measured as a percentage of downtime per a period of time
- * Often uses mechanisms to detect and correct hardware faults in
- runtime;
-
-Serviceability (or maintainability)
- is the simplicity and speed with which a system can be repaired or
- maintained
-
- * Generally measured on Mean Time Between Repair (MTBR)
-
-Improving RAS
--------------
-
-In order to reduce systems downtime, a system should be capable of detecting
-hardware errors, and, when possible correcting them in runtime. It should
-also provide mechanisms to detect hardware degradation, in order to warn
-the system administrator to take the action of replacing a component before
-it causes data loss or system downtime.
-
-Among the monitoring measures, the most usual ones include:
-
-* CPU – detect errors at instruction execution and at L1/L2/L3 caches;
-* Memory – add error correction logic (ECC) to detect and correct errors;
-* I/O – add CRC checksums for transferred data;
-* Storage – RAID, journal file systems, checksums,
- Self-Monitoring, Analysis and Reporting Technology (SMART).
-
-By monitoring the number of occurrences of error detections, it is possible
-to identify if the probability of hardware errors is increasing, and, on such
-case, do a preventive maintenance to replace a degraded component while
-those errors are correctable.
-
-Types of errors
----------------
-
-Most mechanisms used on modern systems use technologies like Hamming
-Codes that allow error correction when the number of errors on a bit packet
-is below a threshold. If the number of errors is above, those mechanisms
-can indicate with a high degree of confidence that an error happened, but
-they can't correct.
-
-Also, sometimes an error occur on a component that it is not used. For
-example, a part of the memory that it is not currently allocated.
-
-That defines some categories of errors:
-
-* **Correctable Error (CE)** - the error detection mechanism detected and
- corrected the error. Such errors are usually not fatal, although some
- Kernel mechanisms allow the system administrator to consider them as fatal.
-
-* **Uncorrected Error (UE)** - the amount of errors happened above the error
- correction threshold, and the system was unable to auto-correct.
-
-* **Fatal Error** - when an UE error happens on a critical component of the
- system (for example, a piece of the Kernel got corrupted by an UE), the
- only reliable way to avoid data corruption is to hang or reboot the machine.
-
-* **Non-fatal Error** - when an UE error happens on an unused component,
- like a CPU in power down state or an unused memory bank, the system may
- still run, eventually replacing the affected hardware by a hot spare,
- if available.
-
- Also, when an error happens on a userspace process, it is also possible to
- kill such process and let userspace restart it.
-
-The mechanism for handling non-fatal errors is usually complex and may
-require the help of some userspace application, in order to apply the
-policy desired by the system administrator.
-
-Identifying a bad hardware component
-------------------------------------
-
-Just detecting a hardware flaw is usually not enough, as the system needs
-to pinpoint to the minimal replaceable unit (MRU) that should be exchanged
-to make the hardware reliable again.
-
-So, it requires not only error logging facilities, but also mechanisms that
-will translate the error message to the silkscreen or component label for
-the MRU.
-
-Typically, it is very complex for memory, as modern CPUs interlace memory
-from different memory modules, in order to provide a better performance. The
-DMI BIOS usually have a list of memory module labels, with can be obtained
-using the ``dmidecode`` tool. For example, on a desktop machine, it shows::
-
- Memory Device
- Total Width: 64 bits
- Data Width: 64 bits
- Size: 16384 MB
- Form Factor: SODIMM
- Set: None
- Locator: ChannelA-DIMM0
- Bank Locator: BANK 0
- Type: DDR4
- Type Detail: Synchronous
- Speed: 2133 MHz
- Rank: 2
- Configured Clock Speed: 2133 MHz
-
-On the above example, a DDR4 SO-DIMM memory module is located at the
-system's memory labeled as "BANK 0", as given by the *bank locator* field.
-Please notice that, on such system, the *total width* is equal to the
-*data width*. It means that such memory module doesn't have error
-detection/correction mechanisms.
-
-Unfortunately, not all systems use the same field to specify the memory
-bank. On this example, from an older server, ``dmidecode`` shows::
-
- Memory Device
- Array Handle: 0x1000
- Error Information Handle: Not Provided
- Total Width: 72 bits
- Data Width: 64 bits
- Size: 8192 MB
- Form Factor: DIMM
- Set: 1
- Locator: DIMM_A1
- Bank Locator: Not Specified
- Type: DDR3
- Type Detail: Synchronous Registered (Buffered)
- Speed: 1600 MHz
- Rank: 2
- Configured Clock Speed: 1600 MHz
-
-There, the DDR3 RDIMM memory module is located at the system's memory labeled
-as "DIMM_A1", as given by the *locator* field. Please notice that this
-memory module has 64 bits of *data width* and 72 bits of *total width*. So,
-it has 8 extra bits to be used by error detection and correction mechanisms.
-Such kind of memory is called Error-correcting code memory (ECC memory).
-
-To make things even worse, it is not uncommon that systems with different
-labels on their system's board to use exactly the same BIOS, meaning that
-the labels provided by the BIOS won't match the real ones.
-
-ECC memory
-----------
-
-As mentioned in the previous section, ECC memory has extra bits to be
-used for error correction. In the above example, a memory module has
-64 bits of *data width*, and 72 bits of *total width*. The extra 8
-bits which are used for the error detection and correction mechanisms
-are referred to as the *syndrome*\ [#f1]_\ [#f2]_.
-
-So, when the cpu requests the memory controller to write a word with
-*data width*, the memory controller calculates the *syndrome* in real time,
-using Hamming code, or some other error correction code, like SECDED+,
-producing a code with *total width* size. Such code is then written
-on the memory modules.
-
-At read, the *total width* bits code is converted back, using the same
-ECC code used on write, producing a word with *data width* and a *syndrome*.
-The word with *data width* is sent to the CPU, even when errors happen.
-
-The memory controller also looks at the *syndrome* in order to check if
-there was an error, and if the ECC code was able to fix such error.
-If the error was corrected, a Corrected Error (CE) happened. If not, an
-Uncorrected Error (UE) happened.
-
-The information about the CE/UE errors is stored on some special registers
-at the memory controller and can be accessed by reading such registers,
-either by BIOS, by some special CPUs or by Linux EDAC driver. On x86 64
-bit CPUs, such errors can also be retrieved via the Machine Check
-Architecture (MCA)\ [#f3]_.
-
-.. [#f1] Please notice that several memory controllers allow operation on a
- mode called "Lock-Step", where it groups two memory modules together,
- doing 128-bit reads/writes. That gives 16 bits for error correction, with
- significantly improves the error correction mechanism, at the expense
- that, when an error happens, there's no way to know what memory module is
- to blame. So, it has to blame both memory modules.
-
-.. [#f2] Some memory controllers also allow using memory in mirror mode.
- On such mode, the same data is written to two memory modules. At read,
- the system checks both memory modules, in order to check if both provide
- identical data. On such configuration, when an error happens, there's no
- way to know what memory module is to blame. So, it has to blame both
- memory modules (or 4 memory modules, if the system is also on Lock-step
- mode).
-
-.. [#f3] For more details about the Machine Check Architecture (MCA),
- please read Documentation/arch/x86/x86_64/machinecheck.rst at the Kernel tree.
-
-EDAC - Error Detection And Correction
-*************************************
-
-.. note::
-
- "bluesmoke" was the name for this device driver subsystem when it
- was "out-of-tree" and maintained at http://bluesmoke.sourceforge.net.
- That site is mostly archaic now and can be used only for historical
- purposes.
-
- When the subsystem was pushed upstream for the first time, on
- Kernel 2.6.16, it was renamed to ``EDAC``.
-
-Purpose
--------
-
-The ``edac`` kernel module's goal is to detect and report hardware errors
-that occur within the computer system running under linux.
-
-Memory
-------
-
-Memory Correctable Errors (CE) and Uncorrectable Errors (UE) are the
-primary errors being harvested. These types of errors are harvested by
-the ``edac_mc`` device.
-
-Detecting CE events, then harvesting those events and reporting them,
-**can** but must not necessarily be a predictor of future UE events. With
-CE events only, the system can and will continue to operate as no data
-has been damaged yet.
-
-However, preventive maintenance and proactive part replacement of memory
-modules exhibiting CEs can reduce the likelihood of the dreaded UE events
-and system panics.
-
-Other hardware elements
------------------------
-
-A new feature for EDAC, the ``edac_device`` class of device, was added in
-the 2.6.23 version of the kernel.
-
-This new device type allows for non-memory type of ECC hardware detectors
-to have their states harvested and presented to userspace via the sysfs
-interface.
-
-Some architectures have ECC detectors for L1, L2 and L3 caches,
-along with DMA engines, fabric switches, main data path switches,
-interconnections, and various other hardware data paths. If the hardware
-reports it, then a edac_device device probably can be constructed to
-harvest and present that to userspace.
-
-
-PCI bus scanning
-----------------
-
-In addition, PCI devices are scanned for PCI Bus Parity and SERR Errors
-in order to determine if errors are occurring during data transfers.
-
-The presence of PCI Parity errors must be examined with a grain of salt.
-There are several add-in adapters that do **not** follow the PCI specification
-with regards to Parity generation and reporting. The specification says
-the vendor should tie the parity status bits to 0 if they do not intend
-to generate parity. Some vendors do not do this, and thus the parity bit
-can "float" giving false positives.
-
-There is a PCI device attribute located in sysfs that is checked by
-the EDAC PCI scanning code. If that attribute is set, PCI parity/error
-scanning is skipped for that device. The attribute is::
-
- broken_parity_status
-
-and is located in ``/sys/devices/pci<XXX>/0000:XX:YY.Z`` directories for
-PCI devices.
-
-
-Versioning
-----------
-
-EDAC is composed of a "core" module (``edac_core.ko``) and several Memory
-Controller (MC) driver modules. On a given system, the CORE is loaded
-and one MC driver will be loaded. Both the CORE and the MC driver (or
-``edac_device`` driver) have individual versions that reflect current
-release level of their respective modules.
-
-Thus, to "report" on what version a system is running, one must report
-both the CORE's and the MC driver's versions.
-
-
-Loading
--------
-
-If ``edac`` was statically linked with the kernel then no loading
-is necessary. If ``edac`` was built as modules then simply modprobe
-the ``edac`` pieces that you need. You should be able to modprobe
-hardware-specific modules and have the dependencies load the necessary
-core modules.
-
-Example::
-
- $ modprobe amd76x_edac
-
-loads both the ``amd76x_edac.ko`` memory controller module and the
-``edac_mc.ko`` core module.
-
-
-Sysfs interface
----------------
-
-EDAC presents a ``sysfs`` interface for control and reporting purposes. It
-lives in the /sys/devices/system/edac directory.
-
-Within this directory there currently reside 2 components:
-
- ======= ==============================
- mc memory controller(s) system
- pci PCI control and status system
- ======= ==============================
-
-
-
-Memory Controller (mc) Model
-----------------------------
-
-Each ``mc`` device controls a set of memory modules [#f4]_. These modules
-are laid out in a Chip-Select Row (``csrowX``) and Channel table (``chX``).
-There can be multiple csrows and multiple channels.
-
-.. [#f4] Nowadays, the term DIMM (Dual In-line Memory Module) is widely
- used to refer to a memory module, although there are other memory
- packaging alternatives, like SO-DIMM, SIMM, etc. The UEFI
- specification (Version 2.7) defines a memory module in the Common
- Platform Error Record (CPER) section to be an SMBIOS Memory Device
- (Type 17). Along this document, and inside the EDAC subsystem, the term
- "dimm" is used for all memory modules, even when they use a
- different kind of packaging.
-
-Memory controllers allow for several csrows, with 8 csrows being a
-typical value. Yet, the actual number of csrows depends on the layout of
-a given motherboard, memory controller and memory module characteristics.
-
-Dual channels allow for dual data length (e. g. 128 bits, on 64 bit systems)
-data transfers to/from the CPU from/to memory. Some newer chipsets allow
-for more than 2 channels, like Fully Buffered DIMMs (FB-DIMMs) memory
-controllers. The following example will assume 2 channels:
-
- +------------+-----------------------+
- | CS Rows | Channels |
- +------------+-----------+-----------+
- | | ``ch0`` | ``ch1`` |
- +============+===========+===========+
- | |**DIMM_A0**|**DIMM_B0**|
- +------------+-----------+-----------+
- | ``csrow0`` | rank0 | rank0 |
- +------------+-----------+-----------+
- | ``csrow1`` | rank1 | rank1 |
- +------------+-----------+-----------+
- | |**DIMM_A1**|**DIMM_B1**|
- +------------+-----------+-----------+
- | ``csrow2`` | rank0 | rank0 |
- +------------+-----------+-----------+
- | ``csrow3`` | rank1 | rank1 |
- +------------+-----------+-----------+
-
-In the above example, there are 4 physical slots on the motherboard
-for memory DIMMs:
-
- +---------+---------+
- | DIMM_A0 | DIMM_B0 |
- +---------+---------+
- | DIMM_A1 | DIMM_B1 |
- +---------+---------+
-
-Labels for these slots are usually silk-screened on the motherboard.
-Slots labeled ``A`` are channel 0 in this example. Slots labeled ``B`` are
-channel 1. Notice that there are two csrows possible on a physical DIMM.
-These csrows are allocated their csrow assignment based on the slot into
-which the memory DIMM is placed. Thus, when 1 DIMM is placed in each
-Channel, the csrows cross both DIMMs.
-
-Memory DIMMs come single or dual "ranked". A rank is a populated csrow.
-In the example above 2 dual ranked DIMMs are similarly placed. Thus,
-both csrow0 and csrow1 are populated. On the other hand, when 2 single
-ranked DIMMs are placed in slots DIMM_A0 and DIMM_B0, then they will
-have just one csrow (csrow0) and csrow1 will be empty. The pattern
-repeats itself for csrow2 and csrow3. Also note that some memory
-controllers don't have any logic to identify the memory module, see
-``rankX`` directories below.
-
-The representation of the above is reflected in the directory
-tree in EDAC's sysfs interface. Starting in directory
-``/sys/devices/system/edac/mc``, each memory controller will be
-represented by its own ``mcX`` directory, where ``X`` is the
-index of the MC::
-
- ..../edac/mc/
- |
- |->mc0
- |->mc1
- |->mc2
- ....
-
-Under each ``mcX`` directory each ``csrowX`` is again represented by a
-``csrowX``, where ``X`` is the csrow index::
-
- .../mc/mc0/
- |
- |->csrow0
- |->csrow2
- |->csrow3
- ....
-
-Notice that there is no csrow1, which indicates that csrow0 is composed
-of a single ranked DIMMs. This should also apply in both Channels, in
-order to have dual-channel mode be operational. Since both csrow2 and
-csrow3 are populated, this indicates a dual ranked set of DIMMs for
-channels 0 and 1.
-
-Within each of the ``mcX`` and ``csrowX`` directories are several EDAC
-control and attribute files.
-
-``mcX`` directories
--------------------
-
-In ``mcX`` directories are EDAC control and attribute files for
-this ``X`` instance of the memory controllers.
-
-For a description of the sysfs API, please see:
-
- Documentation/ABI/testing/sysfs-devices-edac
-
-
-``dimmX`` or ``rankX`` directories
-----------------------------------
-
-The recommended way to use the EDAC subsystem is to look at the information
-provided by the ``dimmX`` or ``rankX`` directories [#f5]_.
-
-A typical EDAC system has the following structure under
-``/sys/devices/system/edac/``\ [#f6]_::
-
- /sys/devices/system/edac/
- ├── mc
- │ ├── mc0
- │ │ ├── ce_count
- │ │ ├── ce_noinfo_count
- │ │ ├── dimm0
- │ │ │ ├── dimm_ce_count
- │ │ │ ├── dimm_dev_type
- │ │ │ ├── dimm_edac_mode
- │ │ │ ├── dimm_label
- │ │ │ ├── dimm_location
- │ │ │ ├── dimm_mem_type
- │ │ │ ├── dimm_ue_count
- │ │ │ ├── size
- │ │ │ └── uevent
- │ │ ├── max_location
- │ │ ├── mc_name
- │ │ ├── reset_counters
- │ │ ├── seconds_since_reset
- │ │ ├── size_mb
- │ │ ├── ue_count
- │ │ ├── ue_noinfo_count
- │ │ └── uevent
- │ ├── mc1
- │ │ ├── ce_count
- │ │ ├── ce_noinfo_count
- │ │ ├── dimm0
- │ │ │ ├── dimm_ce_count
- │ │ │ ├── dimm_dev_type
- │ │ │ ├── dimm_edac_mode
- │ │ │ ├── dimm_label
- │ │ │ ├── dimm_location
- │ │ │ ├── dimm_mem_type
- │ │ │ ├── dimm_ue_count
- │ │ │ ├── size
- │ │ │ └── uevent
- │ │ ├── max_location
- │ │ ├── mc_name
- │ │ ├── reset_counters
- │ │ ├── seconds_since_reset
- │ │ ├── size_mb
- │ │ ├── ue_count
- │ │ ├── ue_noinfo_count
- │ │ └── uevent
- │ └── uevent
- └── uevent
-
-In the ``dimmX`` directories are EDAC control and attribute files for
-this ``X`` memory module:
-
-- ``size`` - Total memory managed by this csrow attribute file
-
- This attribute file displays, in count of megabytes, the memory
- that this csrow contains.
-
-- ``dimm_ue_count`` - Uncorrectable Errors count attribute file
-
- This attribute file displays the total count of uncorrectable
- errors that have occurred on this DIMM. If panic_on_ue is set
- this counter will not have a chance to increment, since EDAC
- will panic the system.
-
-- ``dimm_ce_count`` - Correctable Errors count attribute file
-
- This attribute file displays the total count of correctable
- errors that have occurred on this DIMM. This count is very
- important to examine. CEs provide early indications that a
- DIMM is beginning to fail. This count field should be
- monitored for non-zero values and report such information
- to the system administrator.
-
-- ``dimm_dev_type`` - Device type attribute file
-
- This attribute file will display what type of DRAM device is
- being utilized on this DIMM.
- Examples:
-
- - x1
- - x2
- - x4
- - x8
-
-- ``dimm_edac_mode`` - EDAC Mode of operation attribute file
-
- This attribute file will display what type of Error detection
- and correction is being utilized.
-
-- ``dimm_label`` - memory module label control file
-
- This control file allows this DIMM to have a label assigned
- to it. With this label in the module, when errors occur
- the output can provide the DIMM label in the system log.
- This becomes vital for panic events to isolate the
- cause of the UE event.
-
- DIMM Labels must be assigned after booting, with information
- that correctly identifies the physical slot with its
- silk screen label. This information is currently very
- motherboard specific and determination of this information
- must occur in userland at this time.
-
-- ``dimm_location`` - location of the memory module
-
- The location can have up to 3 levels, and describe how the
- memory controller identifies the location of a memory module.
- Depending on the type of memory and memory controller, it
- can be:
-
- - *csrow* and *channel* - used when the memory controller
- doesn't identify a single DIMM - e. g. in ``rankX`` dir;
- - *branch*, *channel*, *slot* - typically used on FB-DIMM memory
- controllers;
- - *channel*, *slot* - used on Nehalem and newer Intel drivers.
-
-- ``dimm_mem_type`` - Memory Type attribute file
-
- This attribute file will display what type of memory is currently
- on this csrow. Normally, either buffered or unbuffered memory.
- Examples:
-
- - Registered-DDR
- - Unbuffered-DDR
-
-.. [#f5] On some systems, the memory controller doesn't have any logic
- to identify the memory module. On such systems, the directory is called ``rankX`` and works on a similar way as the ``csrowX`` directories.
- On modern Intel memory controllers, the memory controller identifies the
- memory modules directly. On such systems, the directory is called ``dimmX``.
-
-.. [#f6] There are also some ``power`` directories and ``subsystem``
- symlinks inside the sysfs mapping that are automatically created by
- the sysfs subsystem. Currently, they serve no purpose.
-
-``csrowX`` directories
-----------------------
-
-When CONFIG_EDAC_LEGACY_SYSFS is enabled, sysfs will contain the ``csrowX``
-directories. As this API doesn't work properly for Rambus, FB-DIMMs and
-modern Intel Memory Controllers, this is being deprecated in favor of
-``dimmX`` directories.
-
-In the ``csrowX`` directories are EDAC control and attribute files for
-this ``X`` instance of csrow:
-
-
-- ``ue_count`` - Total Uncorrectable Errors count attribute file
-
- This attribute file displays the total count of uncorrectable
- errors that have occurred on this csrow. If panic_on_ue is set
- this counter will not have a chance to increment, since EDAC
- will panic the system.
-
-
-- ``ce_count`` - Total Correctable Errors count attribute file
-
- This attribute file displays the total count of correctable
- errors that have occurred on this csrow. This count is very
- important to examine. CEs provide early indications that a
- DIMM is beginning to fail. This count field should be
- monitored for non-zero values and report such information
- to the system administrator.
-
-
-- ``size_mb`` - Total memory managed by this csrow attribute file
-
- This attribute file displays, in count of megabytes, the memory
- that this csrow contains.
-
-
-- ``mem_type`` - Memory Type attribute file
-
- This attribute file will display what type of memory is currently
- on this csrow. Normally, either buffered or unbuffered memory.
- Examples:
-
- - Registered-DDR
- - Unbuffered-DDR
-
-
-- ``edac_mode`` - EDAC Mode of operation attribute file
-
- This attribute file will display what type of Error detection
- and correction is being utilized.
-
-
-- ``dev_type`` - Device type attribute file
-
- This attribute file will display what type of DRAM device is
- being utilized on this DIMM.
- Examples:
-
- - x1
- - x2
- - x4
- - x8
-
-
-- ``ch0_ce_count`` - Channel 0 CE Count attribute file
-
- This attribute file will display the count of CEs on this
- DIMM located in channel 0.
-
-
-- ``ch0_ue_count`` - Channel 0 UE Count attribute file
-
- This attribute file will display the count of UEs on this
- DIMM located in channel 0.
-
-
-- ``ch0_dimm_label`` - Channel 0 DIMM Label control file
-
-
- This control file allows this DIMM to have a label assigned
- to it. With this label in the module, when errors occur
- the output can provide the DIMM label in the system log.
- This becomes vital for panic events to isolate the
- cause of the UE event.
-
- DIMM Labels must be assigned after booting, with information
- that correctly identifies the physical slot with its
- silk screen label. This information is currently very
- motherboard specific and determination of this information
- must occur in userland at this time.
-
-
-- ``ch1_ce_count`` - Channel 1 CE Count attribute file
-
-
- This attribute file will display the count of CEs on this
- DIMM located in channel 1.
-
-
-- ``ch1_ue_count`` - Channel 1 UE Count attribute file
-
-
- This attribute file will display the count of UEs on this
- DIMM located in channel 0.
-
-
-- ``ch1_dimm_label`` - Channel 1 DIMM Label control file
-
- This control file allows this DIMM to have a label assigned
- to it. With this label in the module, when errors occur
- the output can provide the DIMM label in the system log.
- This becomes vital for panic events to isolate the
- cause of the UE event.
-
- DIMM Labels must be assigned after booting, with information
- that correctly identifies the physical slot with its
- silk screen label. This information is currently very
- motherboard specific and determination of this information
- must occur in userland at this time.
-
-
-System Logging
---------------
-
-If logging for UEs and CEs is enabled, then system logs will contain
-information indicating that errors have been detected::
-
- EDAC MC0: CE page 0x283, offset 0xce0, grain 8, syndrome 0x6ec3, row 0, channel 1 "DIMM_B1": amd76x_edac
- EDAC MC0: CE page 0x1e5, offset 0xfb0, grain 8, syndrome 0xb741, row 0, channel 1 "DIMM_B1": amd76x_edac
-
-
-The structure of the message is:
-
- +---------------------------------------+-------------+
- | Content | Example |
- +=======================================+=============+
- | The memory controller | MC0 |
- +---------------------------------------+-------------+
- | Error type | CE |
- +---------------------------------------+-------------+
- | Memory page | 0x283 |
- +---------------------------------------+-------------+
- | Offset in the page | 0xce0 |
- +---------------------------------------+-------------+
- | The byte granularity | grain 8 |
- | or resolution of the error | |
- +---------------------------------------+-------------+
- | The error syndrome | 0xb741 |
- +---------------------------------------+-------------+
- | Memory row | row 0 |
- +---------------------------------------+-------------+
- | Memory channel | channel 1 |
- +---------------------------------------+-------------+
- | DIMM label, if set prior | DIMM B1 |
- +---------------------------------------+-------------+
- | And then an optional, driver-specific | |
- | message that may have additional | |
- | information. | |
- +---------------------------------------+-------------+
-
-Both UEs and CEs with no info will lack all but memory controller, error
-type, a notice of "no info" and then an optional, driver-specific error
-message.
-
-
-PCI Bus Parity Detection
-------------------------
-
-On Header Type 00 devices, the primary status is looked at for any
-parity error regardless of whether parity is enabled on the device or
-not. (The spec indicates parity is generated in some cases). On Header
-Type 01 bridges, the secondary status register is also looked at to see
-if parity occurred on the bus on the other side of the bridge.
-
-
-Sysfs configuration
--------------------
-
-Under ``/sys/devices/system/edac/pci`` are control and attribute files as
-follows:
-
-
-- ``check_pci_parity`` - Enable/Disable PCI Parity checking control file
-
- This control file enables or disables the PCI Bus Parity scanning
- operation. Writing a 1 to this file enables the scanning. Writing
- a 0 to this file disables the scanning.
-
- Enable::
-
- echo "1" >/sys/devices/system/edac/pci/check_pci_parity
-
- Disable::
-
- echo "0" >/sys/devices/system/edac/pci/check_pci_parity
-
-
-- ``pci_parity_count`` - Parity Count
-
- This attribute file will display the number of parity errors that
- have been detected.
-
-
-Module parameters
------------------
-
-- ``edac_mc_panic_on_ue`` - Panic on UE control file
-
- An uncorrectable error will cause a machine panic. This is usually
- desirable. It is a bad idea to continue when an uncorrectable error
- occurs - it is indeterminate what was uncorrected and the operating
- system context might be so mangled that continuing will lead to further
- corruption. If the kernel has MCE configured, then EDAC will never
- notice the UE.
-
- LOAD TIME::
-
- module/kernel parameter: edac_mc_panic_on_ue=[0|1]
-
- RUN TIME::
-
- echo "1" > /sys/module/edac_core/parameters/edac_mc_panic_on_ue
-
-
-- ``edac_mc_log_ue`` - Log UE control file
-
-
- Generate kernel messages describing uncorrectable errors. These errors
- are reported through the system message log system. UE statistics
- will be accumulated even when UE logging is disabled.
-
- LOAD TIME::
-
- module/kernel parameter: edac_mc_log_ue=[0|1]
-
- RUN TIME::
-
- echo "1" > /sys/module/edac_core/parameters/edac_mc_log_ue
-
-
-- ``edac_mc_log_ce`` - Log CE control file
-
-
- Generate kernel messages describing correctable errors. These
- errors are reported through the system message log system.
- CE statistics will be accumulated even when CE logging is disabled.
-
- LOAD TIME::
-
- module/kernel parameter: edac_mc_log_ce=[0|1]
-
- RUN TIME::
-
- echo "1" > /sys/module/edac_core/parameters/edac_mc_log_ce
-
-
-- ``edac_mc_poll_msec`` - Polling period control file
-
-
- The time period, in milliseconds, for polling for error information.
- Too small a value wastes resources. Too large a value might delay
- necessary handling of errors and might loose valuable information for
- locating the error. 1000 milliseconds (once each second) is the current
- default. Systems which require all the bandwidth they can get, may
- increase this.
-
- LOAD TIME::
-
- module/kernel parameter: edac_mc_poll_msec=[0|1]
-
- RUN TIME::
-
- echo "1000" > /sys/module/edac_core/parameters/edac_mc_poll_msec
-
-
-- ``panic_on_pci_parity`` - Panic on PCI PARITY Error
-
-
- This control file enables or disables panicking when a parity
- error has been detected.
-
-
- module/kernel parameter::
-
- edac_panic_on_pci_pe=[0|1]
-
- Enable::
-
- echo "1" > /sys/module/edac_core/parameters/edac_panic_on_pci_pe
-
- Disable::
-
- echo "0" > /sys/module/edac_core/parameters/edac_panic_on_pci_pe
-
-
-
-EDAC device type
-----------------
-
-In the header file, edac_pci.h, there is a series of edac_device structures
-and APIs for the EDAC_DEVICE.
-
-User space access to an edac_device is through the sysfs interface.
-
-At the location ``/sys/devices/system/edac`` (sysfs) new edac_device devices
-will appear.
-
-There is a three level tree beneath the above ``edac`` directory. For example,
-the ``test_device_edac`` device (found at the http://bluesmoke.sourceforget.net
-website) installs itself as::
-
- /sys/devices/system/edac/test-instance
-
-in this directory are various controls, a symlink and one or more ``instance``
-directories.
-
-The standard default controls are:
-
- ============== =======================================================
- log_ce boolean to log CE events
- log_ue boolean to log UE events
- panic_on_ue boolean to ``panic`` the system if an UE is encountered
- (default off, can be set true via startup script)
- poll_msec time period between POLL cycles for events
- ============== =======================================================
-
-The test_device_edac device adds at least one of its own custom control:
-
- ============== ==================================================
- test_bits which in the current test driver does nothing but
- show how it is installed. A ported driver can
- add one or more such controls and/or attributes
- for specific uses.
- One out-of-tree driver uses controls here to allow
- for ERROR INJECTION operations to hardware
- injection registers
- ============== ==================================================
-
-The symlink points to the 'struct dev' that is registered for this edac_device.
-
-Instances
----------
-
-One or more instance directories are present. For the ``test_device_edac``
-case:
-
- +----------------+
- | test-instance0 |
- +----------------+
-
-
-In this directory there are two default counter attributes, which are totals of
-counter in deeper subdirectories.
-
- ============== ====================================
- ce_count total of CE events of subdirectories
- ue_count total of UE events of subdirectories
- ============== ====================================
-
-Blocks
-------
-
-At the lowest directory level is the ``block`` directory. There can be 0, 1
-or more blocks specified in each instance:
-
- +-------------+
- | test-block0 |
- +-------------+
-
-In this directory the default attributes are:
-
- ============== ================================================
- ce_count which is counter of CE events for this ``block``
- of hardware being monitored
- ue_count which is counter of UE events for this ``block``
- of hardware being monitored
- ============== ================================================
-
-
-The ``test_device_edac`` device adds 4 attributes and 1 control:
-
- ================== ====================================================
- test-block-bits-0 for every POLL cycle this counter
- is incremented
- test-block-bits-1 every 10 cycles, this counter is bumped once,
- and test-block-bits-0 is set to 0
- test-block-bits-2 every 100 cycles, this counter is bumped once,
- and test-block-bits-1 is set to 0
- test-block-bits-3 every 1000 cycles, this counter is bumped once,
- and test-block-bits-2 is set to 0
- ================== ====================================================
-
-
- ================== ====================================================
- reset-counters writing ANY thing to this control will
- reset all the above counters.
- ================== ====================================================
-
-
-Use of the ``test_device_edac`` driver should enable any others to create their own
-unique drivers for their hardware systems.
-
-The ``test_device_edac`` sample driver is located at the
-http://bluesmoke.sourceforge.net project site for EDAC.
-
-
-Usage of EDAC APIs on Nehalem and newer Intel CPUs
---------------------------------------------------
-
-On older Intel architectures, the memory controller was part of the North
-Bridge chipset. Nehalem, Sandy Bridge, Ivy Bridge, Haswell, Sky Lake and
-newer Intel architectures integrated an enhanced version of the memory
-controller (MC) inside the CPUs.
-
-This chapter will cover the differences of the enhanced memory controllers
-found on newer Intel CPUs, such as ``i7core_edac``, ``sb_edac`` and
-``sbx_edac`` drivers.
-
-.. note::
-
- The Xeon E7 processor families use a separate chip for the memory
- controller, called Intel Scalable Memory Buffer. This section doesn't
- apply for such families.
-
-1) There is one Memory Controller per Quick Patch Interconnect
- (QPI). At the driver, the term "socket" means one QPI. This is
- associated with a physical CPU socket.
-
- Each MC have 3 physical read channels, 3 physical write channels and
- 3 logic channels. The driver currently sees it as just 3 channels.
- Each channel can have up to 3 DIMMs.
-
- The minimum known unity is DIMMs. There are no information about csrows.
- As EDAC API maps the minimum unity is csrows, the driver sequentially
- maps channel/DIMM into different csrows.
-
- For example, supposing the following layout::
-
- Ch0 phy rd0, wr0 (0x063f4031): 2 ranks, UDIMMs
- dimm 0 1024 Mb offset: 0, bank: 8, rank: 1, row: 0x4000, col: 0x400
- dimm 1 1024 Mb offset: 4, bank: 8, rank: 1, row: 0x4000, col: 0x400
- Ch1 phy rd1, wr1 (0x063f4031): 2 ranks, UDIMMs
- dimm 0 1024 Mb offset: 0, bank: 8, rank: 1, row: 0x4000, col: 0x400
- Ch2 phy rd3, wr3 (0x063f4031): 2 ranks, UDIMMs
- dimm 0 1024 Mb offset: 0, bank: 8, rank: 1, row: 0x4000, col: 0x400
-
- The driver will map it as::
-
- csrow0: channel 0, dimm0
- csrow1: channel 0, dimm1
- csrow2: channel 1, dimm0
- csrow3: channel 2, dimm0
-
- exports one DIMM per csrow.
-
- Each QPI is exported as a different memory controller.
-
-2) The MC has the ability to inject errors to test drivers. The drivers
- implement this functionality via some error injection nodes:
-
- For injecting a memory error, there are some sysfs nodes, under
- ``/sys/devices/system/edac/mc/mc?/``:
-
- - ``inject_addrmatch/*``:
- Controls the error injection mask register. It is possible to specify
- several characteristics of the address to match an error code::
-
- dimm = the affected dimm. Numbers are relative to a channel;
- rank = the memory rank;
- channel = the channel that will generate an error;
- bank = the affected bank;
- page = the page address;
- column (or col) = the address column.
-
- each of the above values can be set to "any" to match any valid value.
-
- At driver init, all values are set to any.
-
- For example, to generate an error at rank 1 of dimm 2, for any channel,
- any bank, any page, any column::
-
- echo 2 >/sys/devices/system/edac/mc/mc0/inject_addrmatch/dimm
- echo 1 >/sys/devices/system/edac/mc/mc0/inject_addrmatch/rank
-
- To return to the default behaviour of matching any, you can do::
-
- echo any >/sys/devices/system/edac/mc/mc0/inject_addrmatch/dimm
- echo any >/sys/devices/system/edac/mc/mc0/inject_addrmatch/rank
-
- - ``inject_eccmask``:
- specifies what bits will have troubles,
-
- - ``inject_section``:
- specifies what ECC cache section will get the error::
-
- 3 for both
- 2 for the highest
- 1 for the lowest
-
- - ``inject_type``:
- specifies the type of error, being a combination of the following bits::
-
- bit 0 - repeat
- bit 1 - ecc
- bit 2 - parity
-
- - ``inject_enable``:
- starts the error generation when something different than 0 is written.
-
- All inject vars can be read. root permission is needed for write.
-
- Datasheet states that the error will only be generated after a write on an
- address that matches inject_addrmatch. It seems, however, that reading will
- also produce an error.
-
- For example, the following code will generate an error for any write access
- at socket 0, on any DIMM/address on channel 2::
-
- echo 2 >/sys/devices/system/edac/mc/mc0/inject_addrmatch/channel
- echo 2 >/sys/devices/system/edac/mc/mc0/inject_type
- echo 64 >/sys/devices/system/edac/mc/mc0/inject_eccmask
- echo 3 >/sys/devices/system/edac/mc/mc0/inject_section
- echo 1 >/sys/devices/system/edac/mc/mc0/inject_enable
- dd if=/dev/mem of=/dev/null seek=16k bs=4k count=1 >& /dev/null
-
- For socket 1, it is needed to replace "mc0" by "mc1" at the above
- commands.
-
- The generated error message will look like::
-
- EDAC MC0: UE row 0, channel-a= 0 channel-b= 0 labels "-": NON_FATAL (addr = 0x0075b980, socket=0, Dimm=0, Channel=2, syndrome=0x00000040, count=1, Err=8c0000400001009f:4000080482 (read error: read ECC error))
-
-3) Corrected Error memory register counters
-
- Those newer MCs have some registers to count memory errors. The driver
- uses those registers to report Corrected Errors on devices with Registered
- DIMMs.
-
- However, those counters don't work with Unregistered DIMM. As the chipset
- offers some counters that also work with UDIMMs (but with a worse level of
- granularity than the default ones), the driver exposes those registers for
- UDIMM memories.
-
- They can be read by looking at the contents of ``all_channel_counts/``::
-
- $ for i in /sys/devices/system/edac/mc/mc0/all_channel_counts/*; do echo $i; cat $i; done
- /sys/devices/system/edac/mc/mc0/all_channel_counts/udimm0
- 0
- /sys/devices/system/edac/mc/mc0/all_channel_counts/udimm1
- 0
- /sys/devices/system/edac/mc/mc0/all_channel_counts/udimm2
- 0
-
- What happens here is that errors on different csrows, but at the same
- dimm number will increment the same counter.
- So, in this memory mapping::
-
- csrow0: channel 0, dimm0
- csrow1: channel 0, dimm1
- csrow2: channel 1, dimm0
- csrow3: channel 2, dimm0
-
- The hardware will increment udimm0 for an error at the first dimm at either
- csrow0, csrow2 or csrow3;
-
- The hardware will increment udimm1 for an error at the second dimm at either
- csrow0, csrow2 or csrow3;
-
- The hardware will increment udimm2 for an error at the third dimm at either
- csrow0, csrow2 or csrow3;
-
-4) Standard error counters
-
- The standard error counters are generated when an mcelog error is received
- by the driver. Since, with UDIMM, this is counted by software, it is
- possible that some errors could be lost. With RDIMM's, they display the
- contents of the registers
-
-Reference documents used on ``amd64_edac``
-------------------------------------------
-
-``amd64_edac`` module is based on the following documents
-(available from http://support.amd.com/en-us/search/tech-docs):
-
-1. :Title: BIOS and Kernel Developer's Guide for AMD Athlon 64 and AMD
- Opteron Processors
- :AMD publication #: 26094
- :Revision: 3.26
- :Link: http://support.amd.com/TechDocs/26094.PDF
-
-2. :Title: BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh
- Processors
- :AMD publication #: 32559
- :Revision: 3.00
- :Issue Date: May 2006
- :Link: http://support.amd.com/TechDocs/32559.pdf
-
-3. :Title: BIOS and Kernel Developer's Guide (BKDG) For AMD Family 10h
- Processors
- :AMD publication #: 31116
- :Revision: 3.00
- :Issue Date: September 07, 2007
- :Link: http://support.amd.com/TechDocs/31116.pdf
-
-4. :Title: BIOS and Kernel Developer's Guide (BKDG) for AMD Family 15h
- Models 30h-3Fh Processors
- :AMD publication #: 49125
- :Revision: 3.06
- :Issue Date: 2/12/2015 (latest release)
- :Link: http://support.amd.com/TechDocs/49125_15h_Models_30h-3Fh_BKDG.pdf
-
-5. :Title: BIOS and Kernel Developer's Guide (BKDG) for AMD Family 15h
- Models 60h-6Fh Processors
- :AMD publication #: 50742
- :Revision: 3.01
- :Issue Date: 7/23/2015 (latest release)
- :Link: http://support.amd.com/TechDocs/50742_15h_Models_60h-6Fh_BKDG.pdf
-
-6. :Title: BIOS and Kernel Developer's Guide (BKDG) for AMD Family 16h
- Models 00h-0Fh Processors
- :AMD publication #: 48751
- :Revision: 3.03
- :Issue Date: 2/23/2015 (latest release)
- :Link: http://support.amd.com/TechDocs/48751_16h_bkdg.pdf
-
-Credits
-=======
-
-* Written by Doug Thompson <dougthompson@xmission.com>
-
- - 7 Dec 2005
- - 17 Jul 2007 Updated
-
-* |copy| Mauro Carvalho Chehab
-
- - 05 Aug 2009 Nehalem interface
- - 26 Oct 2016 Converted to ReST and cleanups at the Nehalem section
-
-* EDAC authors/maintainers:
-
- - Doug Thompson, Dave Jiang, Dave Peterson et al,
- - Mauro Carvalho Chehab
- - Borislav Petkov
- - original author: Thayne Harbaugh
diff --git a/Documentation/admin-guide/reporting-regressions.rst b/Documentation/admin-guide/reporting-regressions.rst
index d8adccd..76b246e 100644
--- a/Documentation/admin-guide/reporting-regressions.rst
+++ b/Documentation/admin-guide/reporting-regressions.rst
@@ -31,7 +31,7 @@
Linux kernel regression tracking bot "regzbot" track the issue by specifying
when the regression started like this::
- #regzbot introduced v5.13..v5.14-rc1
+ #regzbot introduced: v5.13..v5.14-rc1
All the details on Linux kernel regressions relevant for users
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 6584a1f..7fd43947 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -296,12 +296,30 @@
the console. This is very useful for capturing traces that lead to
crashes and outputting them to a serial console.
-= ===================================================
-0 Disabled (default).
-1 Dump buffers of all CPUs.
-2 Dump the buffer of the CPU that triggered the oops.
-= ===================================================
+======================= ===========================================
+0 Disabled (default).
+1 Dump buffers of all CPUs.
+2(orig_cpu) Dump the buffer of the CPU that triggered the
+ oops.
+<instance> Dump the specific instance buffer on all CPUs.
+<instance>=2(orig_cpu) Dump the specific instance buffer on the CPU
+ that triggered the oops.
+======================= ===========================================
+Multiple instance dump is also supported, and instances are separated
+by commas. If global buffer also needs to be dumped, please specify
+the dump mode (1/2/orig_cpu) first for global buffer.
+
+So for example to dump "foo" and "bar" instance buffer on all CPUs,
+user can::
+
+ echo "foo,bar" > /proc/sys/kernel/ftrace_dump_on_oops
+
+To dump global buffer and "foo" instance buffer on all
+CPUs along with the "bar" instance buffer on CPU that triggered the
+oops, user can::
+
+ echo "1,foo,bar=2" > /proc/sys/kernel/ftrace_dump_on_oops
ftrace_enabled, stack_tracer_enabled
====================================
@@ -594,6 +612,9 @@
``msgmni`` is the maximum number of IPC queues. 32000 by default
(``MSGMNI``).
+All of these parameters are set per ipc namespace. The maximum number of bytes
+in POSIX message queues is limited by ``RLIMIT_MSGQUEUE``. This limit is
+respected hierarchically in the each user namespace.
msg_next_id, sem_next_id, and shm_next_id (System V IPC)
========================================================
@@ -850,6 +871,7 @@
bit 4 print ftrace buffer
bit 5 print all printk messages in buffer
bit 6 print all CPUs backtrace (if available in the arch)
+bit 7 print only tasks in uninterruptible (blocked) state
===== ============================================
So for example to print tasks and memory info on panic, user can::
@@ -1274,15 +1296,20 @@
shmall
======
-This parameter sets the total amount of shared memory pages that
-can be used system wide. Hence, ``shmall`` should always be at least
-``ceil(shmmax/PAGE_SIZE)``.
+This parameter sets the total amount of shared memory pages that can be used
+inside ipc namespace. The shared memory pages counting occurs for each ipc
+namespace separately and is not inherited. Hence, ``shmall`` should always be at
+least ``ceil(shmmax/PAGE_SIZE)``.
If you are not sure what the default ``PAGE_SIZE`` is on your Linux
system, you can run the following command::
# getconf PAGE_SIZE
+To reduce or disable the ability to allocate shared memory, you must create a
+new ipc namespace, set this parameter to the required value and prohibit the
+creation of a new ipc namespace in the current user namespace or cgroups can
+be used.
shmmax
======
diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst
index 3960916..7250c05 100644
--- a/Documentation/admin-guide/sysctl/net.rst
+++ b/Documentation/admin-guide/sysctl/net.rst
@@ -206,6 +206,11 @@
Default: 0 (off)
+mem_pcpu_rsv
+------------
+
+Per-cpu reserved forward alloc cache size in page units. Default 1MB per CPU.
+
rmem_default
------------
diff --git a/Documentation/admin-guide/tainted-kernels.rst b/Documentation/admin-guide/tainted-kernels.rst
index 92a8a07..f925515 100644
--- a/Documentation/admin-guide/tainted-kernels.rst
+++ b/Documentation/admin-guide/tainted-kernels.rst
@@ -34,7 +34,7 @@
You'll find a 'Not tainted: ' there if the kernel was not tainted at the
time of the event; if it was, then it will print 'Tainted: ' and characters
-either letters or blanks. In above example it looks like this::
+either letters or blanks. In the example above it looks like this::
Tainted: P W O
@@ -52,7 +52,7 @@
tainted; any other number indicates the reasons why it is. The easiest way to
decode that number is the script ``tools/debugging/kernel-chktaint``, which your
distribution might ship as part of a package called ``linux-tools`` or
-``kernel-tools``; if it doesn't you can download the script from
+``kernel-tools``; if it doesn't, you can download the script from
`git.kernel.org <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/plain/tools/debugging/kernel-chktaint>`_
and execute it with ``sh kernel-chktaint``, which would print something like
this on the machine that had the statements in the logs that were quoted earlier::
diff --git a/Documentation/admin-guide/verify-bugs-and-bisect-regressions.rst b/Documentation/admin-guide/verify-bugs-and-bisect-regressions.rst
new file mode 100644
index 0000000..d350482
--- /dev/null
+++ b/Documentation/admin-guide/verify-bugs-and-bisect-regressions.rst
@@ -0,0 +1,1985 @@
+.. SPDX-License-Identifier: (GPL-2.0+ OR CC-BY-4.0)
+.. [see the bottom of this file for redistribution information]
+
+=========================================
+How to verify bugs and bisect regressions
+=========================================
+
+This document describes how to check if some Linux kernel problem occurs in code
+currently supported by developers -- to then explain how to locate the change
+causing the issue, if it is a regression (e.g. did not happen with earlier
+versions).
+
+The text aims at people running kernels from mainstream Linux distributions on
+commodity hardware who want to report a kernel bug to the upstream Linux
+developers. Despite this intent, the instructions work just as well for users
+who are already familiar with building their own kernels: they help avoid
+mistakes occasionally made even by experienced developers.
+
+..
+ Note: if you see this note, you are reading the text's source file. You
+ might want to switch to a rendered version: it makes it a lot easier to
+ read and navigate this document -- especially when you want to look something
+ up in the reference section, then jump back to where you left off.
+..
+ Find the latest rendered version of this text here:
+ https://docs.kernel.org/admin-guide/verify-bugs-and-bisect-regressions.rst.html
+
+The essence of the process (aka 'TL;DR')
+========================================
+
+*[If you are new to building or bisecting Linux, ignore this section and head
+over to the* ":ref:`step-by-step guide<introguide_bissbs>`" *below. It utilizes
+the same commands as this section while describing them in brief fashion. The
+steps are nevertheless easy to follow and together with accompanying entries
+in a reference section mention many alternatives, pitfalls, and additional
+aspects, all of which might be essential in your present case.]*
+
+**In case you want to check if a bug is present in code currently supported by
+developers**, execute just the *preparations* and *segment 1*; while doing so,
+consider the newest Linux kernel you regularly use to be the 'working' kernel.
+In the following example that's assumed to be 6.0.13, which is why the sources
+of 6.0 will be used to prepare the .config file.
+
+**In case you face a regression**, follow the steps at least till the end of
+*segment 2*. Then you can submit a preliminary report -- or continue with
+*segment 3*, which describes how to perform a bisection needed for a
+full-fledged regression report. In the following example 6.0.13 is assumed to be
+the 'working' kernel and 6.1.5 to be the first 'broken', which is why 6.0
+will be considered the 'good' release and used to prepare the .config file.
+
+* **Preparations**: set up everything to build your own kernels::
+
+ # * Remove any software that depends on externally maintained kernel modules
+ # or builds any automatically during bootup.
+ # * Ensure Secure Boot permits booting self-compiled Linux kernels.
+ # * If you are not already running the 'working' kernel, reboot into it.
+ # * Install compilers and everything else needed for building Linux.
+ # * Ensure to have 15 Gigabyte free space in your home directory.
+ git clone -o mainline --no-checkout \
+ https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git ~/linux/
+ cd ~/linux/
+ git remote add -t master stable \
+ https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
+ git checkout --detach v6.0
+ # * Hint: if you used an existing clone, ensure no stale .config is around.
+ make olddefconfig
+ # * Ensure the former command picked the .config of the 'working' kernel.
+ # * Connect external hardware (USB keys, tokens, ...), start a VM, bring up
+ # VPNs, mount network shares, and briefly try the feature that is broken.
+ yes '' | make localmodconfig
+ ./scripts/config --set-str CONFIG_LOCALVERSION '-local'
+ ./scripts/config -e CONFIG_LOCALVERSION_AUTO
+ # * Note, when short on storage space, check the guide for an alternative:
+ ./scripts/config -d DEBUG_INFO_NONE -e KALLSYMS_ALL -e DEBUG_KERNEL \
+ -e DEBUG_INFO -e DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT -e KALLSYMS
+ # * Hint: at this point you might want to adjust the build configuration;
+ # you'll have to, if you are running Debian.
+ make olddefconfig
+ cp .config ~/kernel-config-working
+
+* **Segment 1**: build a kernel from the latest mainline codebase.
+
+ This among others checks if the problem was fixed already and which developers
+ later need to be told about the problem; in case of a regression, this rules
+ out a .config change as root of the problem.
+
+ a) Checking out latest mainline code::
+
+ cd ~/linux/
+ git checkout --force --detach mainline/master
+
+ b) Build, install, and boot a kernel::
+
+ cp ~/kernel-config-working .config
+ make olddefconfig
+ make -j $(nproc --all)
+ # * Make sure there is enough disk space to hold another kernel:
+ df -h /boot/ /lib/modules/
+ # * Note: on Arch Linux, its derivatives and a few other distributions
+ # the following commands will do nothing at all or only part of the
+ # job. See the step-by-step guide for further details.
+ sudo make modules_install
+ command -v installkernel && sudo make install
+ # * Check how much space your self-built kernel actually needs, which
+ # enables you to make better estimates later:
+ du -ch /boot/*$(make -s kernelrelease)* | tail -n 1
+ du -sh /lib/modules/$(make -s kernelrelease)/
+ # * Hint: the output of the following command will help you pick the
+ # right kernel from the boot menu:
+ make -s kernelrelease | tee -a ~/kernels-built
+ reboot
+ # * Once booted, ensure you are running the kernel you just built by
+ # checking if the output of the next two commands matches:
+ tail -n 1 ~/kernels-built
+ uname -r
+ cat /proc/sys/kernel/tainted
+
+ c) Check if the problem occurs with this kernel as well.
+
+* **Segment 2**: ensure the 'good' kernel is also a 'working' kernel.
+
+ This among others verifies the trimmed .config file actually works well, as
+ bisecting with it otherwise would be a waste of time:
+
+ a) Start by checking out the sources of the 'good' version::
+
+ cd ~/linux/
+ git checkout --force --detach v6.0
+
+ b) Build, install, and boot a kernel as described earlier in *segment 1,
+ section b* -- just feel free to skip the 'du' commands, as you have a rough
+ estimate already.
+
+ c) Ensure the feature that regressed with the 'broken' kernel actually works
+ with this one.
+
+* **Segment 3**: perform and validate the bisection.
+
+ a) In case your 'broken' version is a stable/longterm release, add the Git
+ branch holding it::
+
+ git remote set-branches --add stable linux-6.1.y
+ git fetch stable
+
+ b) Initialize the bisection::
+
+ cd ~/linux/
+ git bisect start
+ git bisect good v6.0
+ git bisect bad v6.1.5
+
+ c) Build, install, and boot a kernel as described earlier in *segment 1,
+ section b*.
+
+ In case building or booting the kernel fails for unrelated reasons, run
+ ``git bisect skip``. In all other outcomes, check if the regressed feature
+ works with the newly built kernel. If it does, tell Git by executing
+ ``git bisect good``; if it does not, run ``git bisect bad`` instead.
+
+ All three commands will make Git checkout another commit; then re-execute
+ this step (e.g. build, install, boot, and test a kernel to then tell Git
+ the outcome). Do so again and again until Git shows which commit broke
+ things. If you run short of disk space during this process, check the
+ "Supplementary tasks" section below.
+
+ d) Once your finished the bisection, put a few things away::
+
+ cd ~/linux/
+ git bisect log > ~/bisect-log
+ cp .config ~/bisection-config-culprit
+ git bisect reset
+
+ e) Try to verify the bisection result::
+
+ git checkout --force --detach mainline/master
+ git revert --no-edit cafec0cacaca0
+
+ This is optional, as some commits are impossible to revert. But if the
+ second command worked flawlessly, build, install, and boot one more kernel
+ kernel, which should not show the regression.
+
+* **Supplementary tasks**: cleanup during and after the process.
+
+ a) To avoid running out of disk space during a bisection, you might need to
+ remove some kernels you built earlier. You most likely want to keep those
+ you built during segment 1 and 2 around for a while, but you will most
+ likely no longer need kernels tested during the actual bisection
+ (Segment 3 c). You can list them in build order using::
+
+ ls -ltr /lib/modules/*-local*
+
+ To then for example erase a kernel that identifies itself as
+ '6.0-rc1-local-gcafec0cacaca0', use this::
+
+ sudo rm -rf /lib/modules/6.0-rc1-local-gcafec0cacaca0
+ sudo kernel-install -v remove 6.0-rc1-local-gcafec0cacaca0
+ # * Note, on some distributions kernel-install is missing
+ # or does only part of the job.
+
+ b) If you performed a bisection and successfully validated the result, feel
+ free to remove all kernels built during the actual bisection (Segment 3 c);
+ the kernels you built earlier and later you might want to keep around for
+ a week or two.
+
+.. _introguide_bissbs:
+
+Step-by-step guide on how to verify bugs and bisect regressions
+===============================================================
+
+This guide describes how to set up your own Linux kernels for investigating bugs
+or regressions you intent to report. How far you want to follow the instructions
+depends on your issue:
+
+Execute all steps till the end of *segment 1* to **verify if your kernel problem
+is present in code supported by Linux kernel developers**. If it is, you are all
+set to report the bug -- unless it did not happen with earlier kernel versions,
+as then your want to at least continue with *segment 2* to **check if the issue
+qualifies as regression** which receive priority treatment. Depending on the
+outcome you then are ready to report a bug or submit a preliminary regression
+report; instead of the latter your could also head straight on and follow
+*segment 3* to **perform a bisection** for a full-fledged regression report
+developers are obliged to act upon.
+
+ :ref:`Preparations: set up everything to build your own kernels.<introprep_bissbs>`
+
+ :ref:`Segment 1: try to reproduce the problem with the latest codebase.<introlatestcheck_bissbs>`
+
+ :ref:`Segment 2: check if the kernels you build work fine.<introworkingcheck_bissbs>`
+
+ :ref:`Segment 3: perform a bisection and validate the result.<introbisect_bissbs>`
+
+ :ref:`Supplementary tasks: cleanup during and after following this guide.<introclosure_bissbs>`
+
+The steps in each segment illustrate the important aspects of the process, while
+a comprehensive reference section holds additional details for almost all of the
+steps. The reference section sometimes also outlines alternative approaches,
+pitfalls, as well as problems that might occur at the particular step -- and how
+to get things rolling again.
+
+For further details on how to report Linux kernel issues or regressions check
+out Documentation/admin-guide/reporting-issues.rst, which works in conjunction
+with this document. It among others explains why you need to verify bugs with
+the latest 'mainline' kernel, even if you face a problem with a kernel from a
+'stable/longterm' series; for users facing a regression it also explains that
+sending a preliminary report after finishing segment 2 might be wise, as the
+regression and its culprit might be known already. For further details on
+what actually qualifies as a regression check out
+Documentation/admin-guide/reporting-regressions.rst.
+
+.. _introprep_bissbs:
+
+Preparations: set up everything to build your own kernels
+---------------------------------------------------------
+
+.. _backup_bissbs:
+
+* Create a fresh backup and put system repair and restore tools at hand, just
+ to be prepared for the unlikely case of something going sideways.
+
+ [:ref:`details<backup_bisref>`]
+
+.. _vanilla_bissbs:
+
+* Remove all software that depends on externally developed kernel drivers or
+ builds them automatically. That includes but is not limited to DKMS, openZFS,
+ VirtualBox, and Nvidia's graphics drivers (including the GPLed kernel module).
+
+ [:ref:`details<vanilla_bisref>`]
+
+.. _secureboot_bissbs:
+
+* On platforms with 'Secure Boot' or similar solutions, prepare everything to
+ ensure the system will permit your self-compiled kernel to boot. The
+ quickest and easiest way to achieve this on commodity x86 systems is to
+ disable such techniques in the BIOS setup utility; alternatively, remove
+ their restrictions through a process initiated by
+ ``mokutil --disable-validation``.
+
+ [:ref:`details<secureboot_bisref>`]
+
+.. _rangecheck_bissbs:
+
+* Determine the kernel versions considered 'good' and 'bad' throughout this
+ guide.
+
+ Do you follow this guide to verify if a bug is present in the code developers
+ care for? Then consider the mainline release your 'working' kernel (the newest
+ one you regularly use) is based on to be the 'good' version; if your 'working'
+ kernel for example is 6.0.11, then your 'good' kernel is 6.0.
+
+ In case you face a regression, it depends on the version range where the
+ regression was introduced:
+
+ * Something which used to work in Linux 6.0 broke when switching to Linux
+ 6.1-rc1? Then henceforth regard 6.0 as the last known 'good' version
+ and 6.1-rc1 as the first 'bad' one.
+
+ * Some function stopped working when updating from 6.0.11 to 6.1.4? Then for
+ the time being consider 6.0 as the last 'good' version and 6.1.4 as
+ the 'bad' one. Note, at this point it is merely assumed that 6.0 is fine;
+ this assumption will be checked in segment 2.
+
+ * A feature you used in 6.0.11 does not work at all or worse in 6.1.13? In
+ that case you want to bisect within a stable/longterm series: consider
+ 6.0.11 as the last known 'good' version and 6.0.13 as the first 'bad'
+ one. Note, in this case you still want to compile and test a mainline kernel
+ as explained in segment 1: the outcome will determine if you need to report
+ your issue to the regular developers or the stable team.
+
+ *Note, do not confuse 'good' version with 'working' kernel; the latter term
+ throughout this guide will refer to the last kernel that has been working
+ fine.*
+
+ [:ref:`details<rangecheck_bisref>`]
+
+.. _bootworking_bissbs:
+
+* Boot into the 'working' kernel and briefly use the apparently broken feature.
+
+ [:ref:`details<bootworking_bisref>`]
+
+.. _diskspace_bissbs:
+
+* Ensure to have enough free space for building Linux. 15 Gigabyte in your home
+ directory should typically suffice. If you have less available, be sure to pay
+ attention to later steps about retrieving the Linux sources and handling of
+ debug symbols: both explain approaches reducing the amount of space, which
+ should allow you to master these tasks with about 4 Gigabytes free space.
+
+ [:ref:`details<diskspace_bisref>`]
+
+.. _buildrequires_bissbs:
+
+* Install all software required to build a Linux kernel. Often you will need:
+ 'bc', 'binutils' ('ld' et al.), 'bison', 'flex', 'gcc', 'git', 'openssl',
+ 'pahole', 'perl', and the development headers for 'libelf' and 'openssl'. The
+ reference section shows how to quickly install those on various popular Linux
+ distributions.
+
+ [:ref:`details<buildrequires_bisref>`]
+
+.. _sources_bissbs:
+
+* Retrieve the mainline Linux sources; then change into the directory holding
+ them, as all further commands in this guide are meant to be executed from
+ there.
+
+ *Note, the following describe how to retrieve the sources using a full
+ mainline clone, which downloads about 2,75 GByte as of early 2024. The*
+ :ref:`reference section describes two alternatives <sources_bisref>` *:
+ one downloads less than 500 MByte, the other works better with unreliable
+ internet connections.*
+
+ Execute the following command to retrieve a fresh mainline codebase while
+ preparing things to add branches for stable/longterm series later::
+
+ git clone -o mainline --no-checkout \
+ https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git ~/linux/
+ cd ~/linux/
+ git remote add -t master stable \
+ https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
+
+ [:ref:`details<sources_bisref>`]
+
+.. _oldconfig_bissbs:
+
+* Start preparing a kernel build configuration (the '.config' file).
+
+ Before doing so, ensure you are still running the 'working' kernel an earlier
+ step told you to boot; if you are unsure, check the current kernel release
+ identifier using ``uname -r``.
+
+ Afterwards check out the source code for the version earlier established as
+ 'good'. In the following example command this is assumed to be 6.0; note that
+ the version number in this and all later Git commands needs to be prefixed
+ with a 'v'::
+
+ git checkout --detach v6.0
+
+ Now create a build configuration file::
+
+ make olddefconfig
+
+ The kernel build scripts then will try to locate the build configuration file
+ for the running kernel and then adjust it for the needs of the kernel sources
+ you checked out. While doing so, it will print a few lines you need to check.
+
+ Look out for a line starting with '# using defaults found in'. It should be
+ followed by a path to a file in '/boot/' that contains the release identifier
+ of your currently working kernel. If the line instead continues with something
+ like 'arch/x86/configs/x86_64_defconfig', then the build infra failed to find
+ the .config file for your running kernel -- in which case you have to put one
+ there manually, as explained in the reference section.
+
+ In case you can not find such a line, look for one containing '# configuration
+ written to .config'. If that's the case you have a stale build configuration
+ lying around. Unless you intend to use it, delete it; afterwards run
+ 'make olddefconfig' again and check if it now picked up the right config file
+ as base.
+
+ [:ref:`details<oldconfig_bisref>`]
+
+.. _localmodconfig_bissbs:
+
+* Disable any kernel modules apparently superfluous for your setup. This is
+ optional, but especially wise for bisections, as it speeds up the build
+ process enormously -- at least unless the .config file picked up in the
+ previous step was already tailored to your and your hardware needs, in which
+ case you should skip this step.
+
+ To prepare the trimming, connect external hardware you occasionally use (USB
+ keys, tokens, ...), quickly start a VM, and bring up VPNs. And if you rebooted
+ since you started that guide, ensure that you tried using the feature causing
+ trouble since you started the system. Only then trim your .config::
+
+ yes '' | make localmodconfig
+
+ There is a catch to this, as the 'apparently' in initial sentence of this step
+ and the preparation instructions already hinted at:
+
+ The 'localmodconfig' target easily disables kernel modules for features only
+ used occasionally -- like modules for external peripherals not yet connected
+ since booting, virtualization software not yet utilized, VPN tunnels, and a
+ few other things. That's because some tasks rely on kernel modules Linux only
+ loads when you execute tasks like the aforementioned ones for the first time.
+
+ This drawback of localmodconfig is nothing you should lose sleep over, but
+ something to keep in mind: if something is misbehaving with the kernels built
+ during this guide, this is most likely the reason. You can reduce or nearly
+ eliminate the risk with tricks outlined in the reference section; but when
+ building a kernel just for quick testing purposes this is usually not worth
+ spending much effort on, as long as it boots and allows to properly test the
+ feature that causes trouble.
+
+ [:ref:`details<localmodconfig_bisref>`]
+
+.. _tagging_bissbs:
+
+* Ensure all the kernels you will build are clearly identifiable using a special
+ tag and a unique version number::
+
+ ./scripts/config --set-str CONFIG_LOCALVERSION '-local'
+ ./scripts/config -e CONFIG_LOCALVERSION_AUTO
+
+ [:ref:`details<tagging_bisref>`]
+
+.. _debugsymbols_bissbs:
+
+* Decide how to handle debug symbols.
+
+ In the context of this document it is often wise to enable them, as there is a
+ decent chance you will need to decode a stack trace from a 'panic', 'Oops',
+ 'warning', or 'BUG'::
+
+ ./scripts/config -d DEBUG_INFO_NONE -e KALLSYMS_ALL -e DEBUG_KERNEL \
+ -e DEBUG_INFO -e DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT -e KALLSYMS
+
+ But if you are extremely short on storage space, you might want to disable
+ debug symbols instead::
+
+ ./scripts/config -d DEBUG_INFO -d DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT \
+ -d DEBUG_INFO_DWARF4 -d DEBUG_INFO_DWARF5 -e CONFIG_DEBUG_INFO_NONE
+
+ [:ref:`details<debugsymbols_bisref>`]
+
+.. _configmods_bissbs:
+
+* Check if you may want or need to adjust some other kernel configuration
+ options:
+
+ * Are you running Debian? Then you want to avoid known problems by performing
+ additional adjustments explained in the reference section.
+
+ [:ref:`details<configmods_distros_bisref>`].
+
+ * If you want to influence other aspects of the configuration, do so now using
+ your preferred tool. Note, to use make targets like 'menuconfig' or
+ 'nconfig', you will need to install the development files of ncurses; for
+ 'xconfig' you likewise need the Qt5 or Qt6 headers.
+
+ [:ref:`details<configmods_individual_bisref>`].
+
+.. _saveconfig_bissbs:
+
+* Reprocess the .config after the latest adjustments and store it in a safe
+ place::
+
+ make olddefconfig
+ cp .config ~/kernel-config-working
+
+ [:ref:`details<saveconfig_bisref>`]
+
+.. _introlatestcheck_bissbs:
+
+Segment 1: try to reproduce the problem with the latest codebase
+----------------------------------------------------------------
+
+The following steps verify if the problem occurs with the code currently
+supported by developers. In case you face a regression, it also checks that the
+problem is not caused by some .config change, as reporting the issue then would
+be a waste of time. [:ref:`details<introlatestcheck_bisref>`]
+
+.. _checkoutmaster_bissbs:
+
+* Check out the latest Linux codebase::
+
+ cd ~/linux/
+ git checkout --force --detach mainline/master
+
+ [:ref:`details<checkoutmaster_bisref>`]
+
+.. _build_bissbs:
+
+* Build the image and the modules of your first kernel using the config file you
+ prepared::
+
+ cp ~/kernel-config-working .config
+ make olddefconfig
+ make -j $(nproc --all)
+
+ If you want your kernel packaged up as deb, rpm, or tar file, see the
+ reference section for alternatives, which obviously will require other
+ steps to install as well.
+
+ [:ref:`details<build_bisref>`]
+
+.. _install_bissbs:
+
+* Install your newly built kernel.
+
+ Before doing so, consider checking if there is still enough space for it::
+
+ df -h /boot/ /lib/modules/
+
+ For now assume 150 MByte in /boot/ and 200 in /lib/modules/ will suffice; how
+ much your kernels actually require will be determined later during this guide.
+
+ Now install the kernel's modules and its image, which will be stored in
+ parallel to the your Linux distribution's kernels::
+
+ sudo make modules_install
+ command -v installkernel && sudo make install
+
+ The second command ideally will take care of three steps required at this
+ point: copying the kernel's image to /boot/, generating an initramfs, and
+ adding an entry for both to the boot loader's configuration.
+
+ Sadly some distributions (among them Arch Linux, its derivatives, and many
+ immutable Linux distributions) will perform none or only some of those tasks.
+ You therefore want to check if all of them were taken care of and manually
+ perform those that were not. The reference section provides further details on
+ that; your distribution's documentation might help, too.
+
+ Once you figured out the steps needed at this point, consider writing them
+ down: if you will build more kernels as described in segment 2 and 3, you will
+ have to perform those again after executing ``command -v installkernel [...]``.
+
+ [:ref:`details<install_bisref>`]
+
+.. _storagespace_bissbs:
+
+* In case you plan to follow this guide further, check how much storage space
+ the kernel, its modules, and other related files like the initramfs consume::
+
+ du -ch /boot/*$(make -s kernelrelease)* | tail -n 1
+ du -sh /lib/modules/$(make -s kernelrelease)/
+
+ Write down or remember those two values for later: they enable you to prevent
+ running out of disk space accidentally during a bisection.
+
+ [:ref:`details<storagespace_bisref>`]
+
+.. _kernelrelease_bissbs:
+
+* Show and store the kernelrelease identifier of the kernel you just built::
+
+ make -s kernelrelease | tee -a ~/kernels-built
+
+ Remember the identifier momentarily, as it will help you pick the right kernel
+ from the boot menu upon restarting.
+
+* Reboot into your newly built kernel. To ensure your actually started the one
+ you just built, you might want to verify if the output of these commands
+ matches::
+
+ tail -n 1 ~/kernels-built
+ uname -r
+
+.. _tainted_bissbs:
+
+* Check if the kernel marked itself as 'tainted'::
+
+ cat /proc/sys/kernel/tainted
+
+ If that command does not return '0', check the reference section, as the cause
+ for this might interfere with your testing.
+
+ [:ref:`details<tainted_bisref>`]
+
+.. _recheckbroken_bissbs:
+
+* Verify if your bug occurs with the newly built kernel. If it does not, check
+ out the instructions in the reference section to ensure nothing went sideways
+ during your tests.
+
+ [:ref:`details<recheckbroken_bisref>`]
+
+.. _recheckstablebroken_bissbs:
+
+* Are you facing a problem within a stable/longterm series, but failed to
+ reproduce it with the mainline kernel you just built? One that according to
+ the `front page of kernel.org <https://kernel.org/>`_ is still supported? Then
+ check if the latest codebase for the particular series might already fix the
+ problem. To do so, add the stable series Git branch for your 'good' kernel
+ (again, this here is assumed to be 6.0) and check out the latest version::
+
+ cd ~/linux/
+ git remote set-branches --add stable linux-6.0.y
+ git fetch stable
+ git checkout --force --detach linux-6.0.y
+
+ Now use the checked out code to build and install another kernel using the
+ commands the earlier steps already described in more detail::
+
+ cp ~/kernel-config-working .config
+ make olddefconfig
+ make -j $(nproc --all)
+ # * Check if the free space suffices holding another kernel:
+ df -h /boot/ /lib/modules/
+ sudo make modules_install
+ command -v installkernel && sudo make install
+ make -s kernelrelease | tee -a ~/kernels-built
+ reboot
+
+ Confirm you booted the kernel you intended to start and check its tainted
+ status::
+
+ tail -n 1 ~/kernels-built
+ uname -r
+ cat /proc/sys/kernel/tainted
+
+ Now verify if this kernel is showing the problem.
+
+ [:ref:`details<recheckstablebroken_bisref>`]
+
+Do you follow this guide to verify if a problem is present in the code
+currently supported by Linux kernel developers? Then you are done at this
+point. If you later want to remove the kernel you just built, check out
+:ref:`Supplementary tasks: cleanup during and after following this guide<introclosure_bissbs>`.
+
+In case you face a regression, move on and execute at least the next segment
+as well.
+
+.. _introworkingcheck_bissbs:
+
+Segment 2: check if the kernels you build work fine
+---------------------------------------------------
+
+In case of a regression, you now want to ensure the trimmed configuration file
+you created earlier works as expected; a bisection with the .config file
+otherwise would be a waste of time. [:ref:`details<introworkingcheck_bisref>`]
+
+.. _recheckworking_bissbs:
+
+* Build your own variant of the 'working' kernel and check if the feature that
+ regressed works as expected with it.
+
+ Start by checking out the sources for the version earlier established as
+ 'good' (once again assumed to be 6.0 here)::
+
+ cd ~/linux/
+ git checkout --detach v6.0
+
+ Now use the checked out code to configure, build, and install another kernel
+ using the commands the previous subsection explained in more detail::
+
+ cp ~/kernel-config-working .config
+ make olddefconfig
+ make -j $(nproc --all)
+ # * Check if the free space suffices holding another kernel:
+ df -h /boot/ /lib/modules/
+ sudo make modules_install
+ command -v installkernel && sudo make install
+ make -s kernelrelease | tee -a ~/kernels-built
+ reboot
+
+ When the system booted, you may want to verify once again that the
+ kernel you started is the one you just built::
+
+ tail -n 1 ~/kernels-built
+ uname -r
+
+ Now check if this kernel works as expected; if not, consult the reference
+ section for further instructions.
+
+ [:ref:`details<recheckworking_bisref>`]
+
+.. _introbisect_bissbs:
+
+Segment 3: perform the bisection and validate the result
+--------------------------------------------------------
+
+With all the preparations and precaution builds taken care of, you are now ready
+to begin the bisection. This will make you build quite a few kernels -- usually
+about 15 in case you encountered a regression when updating to a newer series
+(say from 6.0.11 to 6.1.3). But do not worry, due to the trimmed build
+configuration created earlier this works a lot faster than many people assume:
+overall on average it will often just take about 10 to 15 minutes to compile
+each kernel on commodity x86 machines.
+
+* In case your 'bad' version is a stable/longterm release (say 6.1.5), add its
+ stable branch, unless you already did so earlier::
+
+ cd ~/linux/
+ git remote set-branches --add stable linux-6.1.y
+ git fetch stable
+
+.. _bisectstart_bissbs:
+
+* Start the bisection and tell Git about the versions earlier established as
+ 'good' (6.0 in the following example command) and 'bad' (6.1.5)::
+
+ cd ~/linux/
+ git bisect start
+ git bisect good v6.0
+ git bisect bad v6.1.5
+
+ [:ref:`details<bisectstart_bisref>`]
+
+.. _bisectbuild_bissbs:
+
+* Now use the code Git checked out to build, install, and boot a kernel using
+ the commands introduced earlier::
+
+ cp ~/kernel-config-working .config
+ make olddefconfig
+ make -j $(nproc --all)
+ # * Check if the free space suffices holding another kernel:
+ df -h /boot/ /lib/modules/
+ sudo make modules_install
+ command -v installkernel && sudo make install
+ make -s kernelrelease | tee -a ~/kernels-built
+ reboot
+
+ If compilation fails for some reason, run ``git bisect skip`` and restart
+ executing the stack of commands from the beginning.
+
+ In case you skipped the "test latest codebase" step in the guide, check its
+ description as for why the 'df [...]' and 'make -s kernelrelease [...]'
+ commands are here.
+
+ Important note: the latter command from this point on will print release
+ identifiers that might look odd or wrong to you -- which they are not, as it's
+ totally normal to see release identifiers like '6.0-rc1-local-gcafec0cacaca0'
+ if you bisect between versions 6.1 and 6.2 for example.
+
+ [:ref:`details<bisectbuild_bisref>`]
+
+.. _bisecttest_bissbs:
+
+* Now check if the feature that regressed works in the kernel you just built.
+
+ You again might want to start by making sure the kernel you booted is the one
+ you just built::
+
+ cd ~/linux/
+ tail -n 1 ~/kernels-built
+ uname -r
+
+ Now verify if the feature that regressed works at this kernel bisection point.
+ If it does, run this::
+
+ git bisect good
+
+ If it does not, run this::
+
+ git bisect bad
+
+ Be sure about what you tell Git, as getting this wrong just once will send the
+ rest of the bisection totally off course.
+
+ While the bisection is ongoing, Git will use the information you provided to
+ find and check out another bisection point for you to test. While doing so, it
+ will print something like 'Bisecting: 675 revisions left to test after this
+ (roughly 10 steps)' to indicate how many further changes it expects to be
+ tested. Now build and install another kernel using the instructions from the
+ previous step; afterwards follow the instructions in this step again.
+
+ Repeat this again and again until you finish the bisection -- that's the case
+ when Git after tagging a change as 'good' or 'bad' prints something like
+ 'cafecaca0c0dacafecaca0c0dacafecaca0c0da is the first bad commit'; right
+ afterwards it will show some details about the culprit including the patch
+ description of the change. The latter might fill your terminal screen, so you
+ might need to scroll up to see the message mentioning the culprit;
+ alternatively, run ``git bisect log > ~/bisection-log``.
+
+ [:ref:`details<bisecttest_bisref>`]
+
+.. _bisectlog_bissbs:
+
+* Store Git's bisection log and the current .config file in a safe place before
+ telling Git to reset the sources to the state before the bisection::
+
+ cd ~/linux/
+ git bisect log > ~/bisection-log
+ cp .config ~/bisection-config-culprit
+ git bisect reset
+
+ [:ref:`details<bisectlog_bisref>`]
+
+.. _revert_bissbs:
+
+* Try reverting the culprit on top of latest mainline to see if this fixes your
+ regression.
+
+ This is optional, as it might be impossible or hard to realize. The former is
+ the case, if the bisection determined a merge commit as the culprit; the
+ latter happens if other changes depend on the culprit. But if the revert
+ succeeds, it is worth building another kernel, as it validates the result of
+ a bisection, which can easily deroute; it furthermore will let kernel
+ developers know, if they can resolve the regression with a quick revert.
+
+ Begin by checking out the latest codebase depending on the range you bisected:
+
+ * Did you face a regression within a stable/longterm series (say between
+ 6.0.11 and 6.0.13) that does not happen in mainline? Then check out the
+ latest codebase for the affected series like this::
+
+ git fetch stable
+ git checkout --force --detach linux-6.0.y
+
+ * In all other cases check out latest mainline::
+
+ git fetch mainline
+ git checkout --force --detach mainline/master
+
+ If you bisected a regression within a stable/longterm series that also
+ happens in mainline, there is one more thing to do: look up the mainline
+ commit-id. To do so, use a command like ``git show abcdcafecabcd`` to
+ view the patch description of the culprit. There will be a line near
+ the top which looks like 'commit cafec0cacaca0 upstream.' or
+ 'Upstream commit cafec0cacaca0'; use that commit-id in the next command
+ and not the one the bisection blamed.
+
+ Now try reverting the culprit by specifying its commit id::
+
+ git revert --no-edit cafec0cacaca0
+
+ If that fails, give up trying and move on to the next step. But if it works,
+ build a kernel again using the familiar command sequence::
+
+ cp ~/kernel-config-working .config
+ make olddefconfig &&
+ make -j $(nproc --all) &&
+ # * Check if the free space suffices holding another kernel:
+ df -h /boot/ /lib/modules/
+ sudo make modules_install
+ command -v installkernel && sudo make install
+ Make -s kernelrelease | tee -a ~/kernels-built
+ reboot
+
+ Now check one last time if the feature that made you perform a bisection work
+ with that kernel.
+
+ [:ref:`details<revert_bisref>`]
+
+.. _introclosure_bissbs:
+
+Supplementary tasks: cleanup during and after the bisection
+-----------------------------------------------------------
+
+During and after following this guide you might want or need to remove some of
+the kernels you installed: the boot menu otherwise will become confusing or
+space might run out.
+
+.. _makeroom_bissbs:
+
+* To remove one of the kernels you installed, look up its 'kernelrelease'
+ identifier. This guide stores them in '~/kernels-built', but the following
+ command will print them as well::
+
+ ls -ltr /lib/modules/*-local*
+
+ You in most situations want to remove the oldest kernels built during the
+ actual bisection (e.g. segment 3 of this guide). The two ones you created
+ beforehand (e.g. to test the latest codebase and the version considered
+ 'good') might become handy to verify something later -- thus better keep them
+ around, unless you are really short on storage space.
+
+ To remove the modules of a kernel with the kernelrelease identifier
+ '*6.0-rc1-local-gcafec0cacaca0*', start by removing the directory holding its
+ modules::
+
+ sudo rm -rf /lib/modules/6.0-rc1-local-gcafec0cacaca0
+
+ Afterwards try the following command::
+
+ sudo kernel-install -v remove 6.0-rc1-local-gcafec0cacaca0
+
+ On quite a few distributions this will delete all other kernel files installed
+ while also removing the kernel's entry from the boot menu. But on some
+ distributions kernel-install does not exist or leaves boot-loader entries or
+ kernel image and related files behind; in that case remove them as described
+ in the reference section.
+
+ [:ref:`details<makeroom_bisref>`]
+
+.. _finishingtouch_bissbs:
+
+* Once you have finished the bisection, do not immediately remove anything you
+ set up, as you might need a few things again. What is safe to remove depends
+ on the outcome of the bisection:
+
+ * Could you initially reproduce the regression with the latest codebase and
+ after the bisection were able to fix the problem by reverting the culprit on
+ top of the latest codebase? Then you want to keep those two kernels around
+ for a while, but safely remove all others with a '-local' in the release
+ identifier.
+
+ * Did the bisection end on a merge-commit or seems questionable for other
+ reasons? Then you want to keep as many kernels as possible around for a few
+ days: it's pretty likely that you will be asked to recheck something.
+
+ * In other cases it likely is a good idea to keep the following kernels around
+ for some time: the one built from the latest codebase, the one created from
+ the version considered 'good', and the last three or four you compiled
+ during the actual bisection process.
+
+ [:ref:`details<finishingtouch_bisref>`]
+
+.. _submit_improvements:
+
+This concludes the step-by-step guide.
+
+Did you run into trouble following any of the above steps not cleared up by the
+reference section below? Did you spot errors? Or do you have ideas how to
+improve the guide? Then please take a moment and let the maintainer of this
+document know by email (Thorsten Leemhuis <linux@leemhuis.info>), ideally while
+CCing the Linux docs mailing list (linux-doc@vger.kernel.org). Such feedback is
+vital to improve this document further, which is in everybody's interest, as it
+will enable more people to master the task described here -- and hopefully also
+improve similar guides inspired by this one.
+
+
+Reference section for the step-by-step guide
+============================================
+
+This section holds additional information for almost all the items in the above
+step-by-step guide.
+
+.. _backup_bisref:
+
+Prepare for emergencies
+-----------------------
+
+ *Create a fresh backup and put system repair and restore tools at hand.*
+ [:ref:`... <backup_bissbs>`]
+
+Remember, you are dealing with computers, which sometimes do unexpected things
+-- especially if you fiddle with crucial parts like the kernel of an operating
+system. That's what you are about to do in this process. Hence, better prepare
+for something going sideways, even if that should not happen.
+
+[:ref:`back to step-by-step guide <backup_bissbs>`]
+
+.. _vanilla_bisref:
+
+Remove anything related to externally maintained kernel modules
+---------------------------------------------------------------
+
+ *Remove all software that depends on externally developed kernel drivers or
+ builds them automatically.* [:ref:`...<vanilla_bissbs>`]
+
+Externally developed kernel modules can easily cause trouble during a bisection.
+
+But there is a more important reason why this guide contains this step: most
+kernel developers will not care about reports about regressions occurring with
+kernels that utilize such modules. That's because such kernels are not
+considered 'vanilla' anymore, as Documentation/admin-guide/reporting-issues.rst
+explains in more detail.
+
+[:ref:`back to step-by-step guide <vanilla_bissbs>`]
+
+.. _secureboot_bisref:
+
+Deal with techniques like Secure Boot
+-------------------------------------
+
+ *On platforms with 'Secure Boot' or similar techniques, prepare everything to
+ ensure the system will permit your self-compiled kernel to boot later.*
+ [:ref:`... <secureboot_bissbs>`]
+
+Many modern systems allow only certain operating systems to start; that's why
+they reject booting self-compiled kernels by default.
+
+You ideally deal with this by making your platform trust your self-built kernels
+with the help of a certificate. How to do that is not described
+here, as it requires various steps that would take the text too far away from
+its purpose; 'Documentation/admin-guide/module-signing.rst' and various web
+sides already explain everything needed in more detail.
+
+Temporarily disabling solutions like Secure Boot is another way to make your own
+Linux boot. On commodity x86 systems it is possible to do this in the BIOS Setup
+utility; the required steps vary a lot between machines and therefore cannot be
+described here.
+
+On mainstream x86 Linux distributions there is a third and universal option:
+disable all Secure Boot restrictions for your Linux environment. You can
+initiate this process by running ``mokutil --disable-validation``; this will
+tell you to create a one-time password, which is safe to write down. Now
+restart; right after your BIOS performed all self-tests the bootloader Shim will
+show a blue box with a message 'Press any key to perform MOK management'. Hit
+some key before the countdown exposes, which will open a menu. Choose 'Change
+Secure Boot state'. Shim's 'MokManager' will now ask you to enter three
+randomly chosen characters from the one-time password specified earlier. Once
+you provided them, confirm you really want to disable the validation.
+Afterwards, permit MokManager to reboot the machine.
+
+[:ref:`back to step-by-step guide <secureboot_bissbs>`]
+
+.. _bootworking_bisref:
+
+Boot the last kernel that was working
+-------------------------------------
+
+ *Boot into the last working kernel and briefly recheck if the feature that
+ regressed really works.* [:ref:`...<bootworking_bissbs>`]
+
+This will make later steps that cover creating and trimming the configuration do
+the right thing.
+
+[:ref:`back to step-by-step guide <bootworking_bissbs>`]
+
+.. _diskspace_bisref:
+
+Space requirements
+------------------
+
+ *Ensure to have enough free space for building Linux.*
+ [:ref:`... <diskspace_bissbs>`]
+
+The numbers mentioned are rough estimates with a big extra charge to be on the
+safe side, so often you will need less.
+
+If you have space constraints, be sure to hay attention to the :ref:`step about
+debug symbols' <debugsymbols_bissbs>` and its :ref:`accompanying reference
+section' <debugsymbols_bisref>`, as disabling then will reduce the consumed disk
+space by quite a few gigabytes.
+
+[:ref:`back to step-by-step guide <diskspace_bissbs>`]
+
+.. _rangecheck_bisref:
+
+Bisection range
+---------------
+
+ *Determine the kernel versions considered 'good' and 'bad' throughout this
+ guide.* [:ref:`...<rangecheck_bissbs>`]
+
+Establishing the range of commits to be checked is mostly straightforward,
+except when a regression occurred when switching from a release of one stable
+series to a release of a later series (e.g. from 6.0.11 to 6.1.4). In that case
+Git will need some hand holding, as there is no straight line of descent.
+
+That's because with the release of 6.0 mainline carried on to 6.1 while the
+stable series 6.0.y branched to the side. It's therefore theoretically possible
+that the issue you face with 6.1.4 only worked in 6.0.11, as it was fixed by a
+commit that went into one of the 6.0.y releases, but never hit mainline or the
+6.1.y series. Thankfully that normally should not happen due to the way the
+stable/longterm maintainers maintain the code. It's thus pretty safe to assume
+6.0 as a 'good' kernel. That assumption will be tested anyway, as that kernel
+will be built and tested in the segment '2' of this guide; Git would force you
+to do this as well, if you tried bisecting between 6.0.11 and 6.1.13.
+
+[:ref:`back to step-by-step guide <rangecheck_bissbs>`]
+
+.. _buildrequires_bisref:
+
+Install build requirements
+--------------------------
+
+ *Install all software required to build a Linux kernel.*
+ [:ref:`...<buildrequires_bissbs>`]
+
+The kernel is pretty stand-alone, but besides tools like the compiler you will
+sometimes need a few libraries to build one. How to install everything needed
+depends on your Linux distribution and the configuration of the kernel you are
+about to build.
+
+Here are a few examples what you typically need on some mainstream
+distributions:
+
+* Arch Linux and derivatives::
+
+ sudo pacman --needed -S bc binutils bison flex gcc git kmod libelf openssl \
+ pahole perl zlib ncurses qt6-base
+
+* Debian, Ubuntu, and derivatives::
+
+ sudo apt install bc binutils bison dwarves flex gcc git kmod libelf-dev \
+ libssl-dev make openssl pahole perl-base pkg-config zlib1g-dev \
+ libncurses-dev qt6-base-dev g++
+
+* Fedora and derivatives::
+
+ sudo dnf install binutils \
+ /usr/bin/{bc,bison,flex,gcc,git,openssl,make,perl,pahole,rpmbuild} \
+ /usr/include/{libelf.h,openssl/pkcs7.h,zlib.h,ncurses.h,qt6/QtGui/QAction}
+
+* openSUSE and derivatives::
+
+ sudo zypper install bc binutils bison dwarves flex gcc git \
+ kernel-install-tools libelf-devel make modutils openssl openssl-devel \
+ perl-base zlib-devel rpm-build ncurses-devel qt6-base-devel
+
+These commands install a few packages that are often, but not always needed. You
+for example might want to skip installing the development headers for ncurses,
+which you will only need in case you later might want to adjust the kernel build
+configuration using make the targets 'menuconfig' or 'nconfig'; likewise omit
+the headers of Qt6 is you do not plan to adjust the .config using 'xconfig'.
+
+You furthermore might need additional libraries and their development headers
+for tasks not covered in this guide -- for example when building utilities from
+the kernel's tools/ directory.
+
+[:ref:`back to step-by-step guide <buildrequires_bissbs>`]
+
+.. _sources_bisref:
+
+Download the sources using Git
+------------------------------
+
+ *Retrieve the Linux mainline sources.*
+ [:ref:`...<sources_bissbs>`]
+
+The step-by-step guide outlines how to download the Linux sources using a full
+Git clone of Linus' mainline repository. There is nothing more to say about
+that -- but there are two alternatives ways to retrieve the sources that might
+work better for you:
+
+* If you have an unreliable internet connection, consider
+ :ref:`using a 'Git bundle'<sources_bundle_bisref>`.
+
+* If downloading the complete repository would take too long or requires too
+ much storage space, consider :ref:`using a 'shallow
+ clone'<sources_shallow_bisref>`.
+
+.. _sources_bundle_bisref:
+
+Downloading Linux mainline sources using a bundle
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Use the following commands to retrieve the Linux mainline sources using a
+bundle::
+
+ wget -c \
+ https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/clone.bundle
+ git clone --no-checkout clone.bundle ~/linux/
+ cd ~/linux/
+ git remote remove origin
+ git remote add mainline \
+ https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
+ git fetch mainline
+ git remote add -t master stable \
+ https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
+
+In case the 'wget' command fails, just re-execute it, it will pick up where
+it left off.
+
+[:ref:`back to step-by-step guide <sources_bissbs>`]
+[:ref:`back to section intro <sources_bisref>`]
+
+.. _sources_shallow_bisref:
+
+Downloading Linux mainline sources using a shallow clone
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+First, execute the following command to retrieve the latest mainline codebase::
+
+ git clone -o mainline --no-checkout --depth 1 -b master \
+ https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git ~/linux/
+ cd ~/linux/
+ git remote add -t master stable \
+ https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
+
+Now deepen your clone's history to the second predecessor of the mainline
+release of your 'good' version. In case the latter are 6.0 or 6.0.11, 5.19 would
+be the first predecessor and 5.18 the second -- hence deepen the history up to
+that version::
+
+ git fetch --shallow-exclude=v5.18 mainline
+
+Afterwards add the stable Git repository as remote and all required stable
+branches as explained in the step-by-step guide.
+
+Note, shallow clones have a few peculiar characteristics:
+
+* For bisections the history needs to be deepened a few mainline versions
+ farther than it seems necessary, as explained above already. That's because
+ Git otherwise will be unable to revert or describe most of the commits within
+ a range (say 6.1..6.2), as they are internally based on earlier kernels
+ releases (like 6.0-rc2 or 5.19-rc3).
+
+* This document in most places uses ``git fetch`` with ``--shallow-exclude=``
+ to specify the earliest version you care about (or to be precise: its git
+ tag). You alternatively can use the parameter ``--shallow-since=`` to specify
+ an absolute (say ``'2023-07-15'``) or relative (``'12 months'``) date to
+ define the depth of the history you want to download. When using them while
+ bisecting mainline, ensure to deepen the history to at least 7 months before
+ the release of the mainline release your 'good' kernel is based on.
+
+* Be warned, when deepening your clone you might encounter an error like
+ 'fatal: error in object: unshallow cafecaca0c0dacafecaca0c0dacafecaca0c0da'.
+ In that case run ``git repack -d`` and try again.
+
+[:ref:`back to step-by-step guide <sources_bissbs>`]
+[:ref:`back to section intro <sources_bisref>`]
+
+.. _oldconfig_bisref:
+
+Start defining the build configuration for your kernel
+------------------------------------------------------
+
+ *Start preparing a kernel build configuration (the '.config' file).*
+ [:ref:`... <oldconfig_bissbs>`]
+
+*Note, this is the first of multiple steps in this guide that create or modify
+build artifacts. The commands used in this guide store them right in the source
+tree to keep things simple. In case you prefer storing the build artifacts
+separately, create a directory like '~/linux-builddir/' and add the parameter
+``O=~/linux-builddir/`` to all make calls used throughout this guide. You will
+have to point other commands there as well -- among them the ``./scripts/config
+[...]`` commands, which will require ``--file ~/linux-builddir/.config`` to
+locate the right build configuration.*
+
+Two things can easily go wrong when creating a .config file as advised:
+
+* The oldconfig target will use a .config file from your build directory, if
+ one is already present there (e.g. '~/linux/.config'). That's totally fine if
+ that's what you intend (see next step), but in all other cases you want to
+ delete it. This for example is important in case you followed this guide
+ further, but due to problems come back here to redo the configuration from
+ scratch.
+
+* Sometimes olddefconfig is unable to locate the .config file for your running
+ kernel and will use defaults, as briefly outlined in the guide. In that case
+ check if your distribution ships the configuration somewhere and manually put
+ it in the right place (e.g. '~/linux/.config') if it does. On distributions
+ where /proc/config.gz exists this can be achieved using this command::
+
+ zcat /proc/config.gz > .config
+
+ Once you put it there, run ``make olddefconfig`` again to adjust it to the
+ needs of the kernel about to be built.
+
+Note, the olddefconfig target will set any undefined build options to their
+default value. If you prefer to set such configuration options manually, use
+``make oldconfig`` instead. Then for each undefined configuration option you
+will be asked how to proceed; in case you are unsure what to answer, simply hit
+'enter' to apply the default value. Note though that for bisections you normally
+want to go with the defaults, as you otherwise might enable a new feature that
+causes a problem looking like regressions (for example due to security
+restrictions).
+
+Occasionally odd things happen when trying to use a config file prepared for one
+kernel (say 6.1) on an older mainline release -- especially if it is much older
+(say 5.15). That's one of the reasons why the previous step in the guide told
+you to boot the kernel where everything works. If you manually add a .config
+file you thus want to ensure it's from the working kernel and not from a one
+that shows the regression.
+
+In case you want to build kernels for another machine, locate its kernel build
+configuration; usually ``ls /boot/config-$(uname -r)`` will print its name. Copy
+that file to the build machine and store it as ~/linux/.config; afterwards run
+``make olddefconfig`` to adjust it.
+
+[:ref:`back to step-by-step guide <oldconfig_bissbs>`]
+
+.. _localmodconfig_bisref:
+
+Trim the build configuration for your kernel
+--------------------------------------------
+
+ *Disable any kernel modules apparently superfluous for your setup.*
+ [:ref:`... <localmodconfig_bissbs>`]
+
+As explained briefly in the step-by-step guide already: with localmodconfig it
+can easily happen that your self-built kernels will lack modules for tasks you
+did not perform at least once before utilizing this make target. That happens
+when a task requires kernel modules which are only autoloaded when you execute
+it for the first time. So when you never performed that task since starting your
+kernel the modules will not have been loaded -- and from localmodonfig's point
+of view look superfluous, which thus disables them to reduce the amount of code
+to be compiled.
+
+You can try to avoid this by performing typical tasks that often will autoload
+additional kernel modules: start a VM, establish VPN connections, loop-mount a
+CD/DVD ISO, mount network shares (CIFS, NFS, ...), and connect all external
+devices (2FA keys, headsets, webcams, ...) as well as storage devices with file
+systems you otherwise do not utilize (btrfs, ext4, FAT, NTFS, XFS, ...). But it
+is hard to think of everything that might be needed -- even kernel developers
+often forget one thing or another at this point.
+
+Do not let that risk bother you, especially when compiling a kernel only for
+testing purposes: everything typically crucial will be there. And if you forget
+something important you can turn on a missing feature manually later and quickly
+run the commands again to compile and install a kernel that has everything you
+need.
+
+But if you plan to build and use self-built kernels regularly, you might want to
+reduce the risk by recording which modules your system loads over the course of
+a few weeks. You can automate this with `modprobed-db
+<https://github.com/graysky2/modprobed-db>`_. Afterwards use ``LSMOD=<path>`` to
+point localmodconfig to the list of modules modprobed-db noticed being used::
+
+ yes '' | make LSMOD='${HOME}'/.config/modprobed.db localmodconfig
+
+That parameter also allows you to build trimmed kernels for another machine in
+case you copied a suitable .config over to use as base (see previous step). Just
+run ``lsmod > lsmod_foo-machine`` on that system and copy the generated file to
+your build's host home directory. Then run these commands instead of the one the
+step-by-step guide mentions::
+
+ yes '' | make LSMOD=~/lsmod_foo-machine localmodconfig
+
+[:ref:`back to step-by-step guide <localmodconfig_bissbs>`]
+
+.. _tagging_bisref:
+
+Tag the kernels about to be build
+---------------------------------
+
+ *Ensure all the kernels you will build are clearly identifiable using a
+ special tag and a unique version identifier.* [:ref:`... <tagging_bissbs>`]
+
+This allows you to differentiate your distribution's kernels from those created
+during this process, as the file or directories for the latter will contain
+'-local' in the name; it also helps picking the right entry in the boot menu and
+not lose track of you kernels, as their version numbers will look slightly
+confusing during the bisection.
+
+[:ref:`back to step-by-step guide <tagging_bissbs>`]
+
+.. _debugsymbols_bisref:
+
+Decide to enable or disable debug symbols
+-----------------------------------------
+
+ *Decide how to handle debug symbols.* [:ref:`... <debugsymbols_bissbs>`]
+
+Having debug symbols available can be important when your kernel throws a
+'panic', 'Oops', 'warning', or 'BUG' later when running, as then you will be
+able to find the exact place where the problem occurred in the code. But
+collecting and embedding the needed debug information takes time and consumes
+quite a bit of space: in late 2022 the build artifacts for a typical x86 kernel
+trimmed with localmodconfig consumed around 5 Gigabyte of space with debug
+symbols, but less than 1 when they were disabled. The resulting kernel image and
+modules are bigger as well, which increases storage requirements for /boot/ and
+load times.
+
+In case you want a small kernel and are unlikely to decode a stack trace later,
+you thus might want to disable debug symbols to avoid those downsides. If it
+later turns out that you need them, just enable them as shown and rebuild the
+kernel.
+
+You on the other hand definitely want to enable them for this process, if there
+is a decent chance that you need to decode a stack trace later. The section
+'Decode failure messages' in Documentation/admin-guide/reporting-issues.rst
+explains this process in more detail.
+
+[:ref:`back to step-by-step guide <debugsymbols_bissbs>`]
+
+.. _configmods_bisref:
+
+Adjust build configuration
+--------------------------
+
+ *Check if you may want or need to adjust some other kernel configuration
+ options:*
+
+Depending on your needs you at this point might want or have to adjust some
+kernel configuration options.
+
+.. _configmods_distros_bisref:
+
+Distro specific adjustments
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ *Are you running* [:ref:`... <configmods_bissbs>`]
+
+The following sections help you to avoid build problems that are known to occur
+when following this guide on a few commodity distributions.
+
+**Debian:**
+
+* Remove a stale reference to a certificate file that would cause your build to
+ fail::
+
+ ./scripts/config --set-str SYSTEM_TRUSTED_KEYS ''
+
+ Alternatively, download the needed certificate and make that configuration
+ option point to it, as `the Debian handbook explains in more detail
+ <https://debian-handbook.info/browse/stable/sect.kernel-compilation.html>`_
+ -- or generate your own, as explained in
+ Documentation/admin-guide/module-signing.rst.
+
+[:ref:`back to step-by-step guide <configmods_bissbs>`]
+
+.. _configmods_individual_bisref:
+
+Individual adjustments
+~~~~~~~~~~~~~~~~~~~~~~
+
+ *If you want to influence the other aspects of the configuration, do so
+ now.* [:ref:`... <configmods_bissbs>`]
+
+At this point you can use a command like ``make menuconfig`` or ``make nconfig``
+to enable or disable certain features using a text-based user interface; to use
+a graphical configuration utility, run ``make xconfig`` instead. Both of them
+require development libraries from toolkits they are rely on (ncurses
+respectively Qt5 or Qt6); an error message will tell you if something required
+is missing.
+
+[:ref:`back to step-by-step guide <configmods_bissbs>`]
+
+.. _saveconfig_bisref:
+
+Put the .config file aside
+--------------------------
+
+ *Reprocess the .config after the latest changes and store it in a safe place.*
+ [:ref:`... <saveconfig_bissbs>`]
+
+Put the .config you prepared aside, as you want to copy it back to the build
+directory every time during this guide before you start building another
+kernel. That's because going back and forth between different versions can alter
+.config files in odd ways; those occasionally cause side effects that could
+confuse testing or in some cases render the result of your bisection
+meaningless.
+
+[:ref:`back to step-by-step guide <saveconfig_bissbs>`]
+
+.. _introlatestcheck_bisref:
+
+Try to reproduce the regression
+-----------------------------------------
+
+ *Verify the regression is not caused by some .config change and check if it
+ still occurs with the latest codebase.* [:ref:`... <introlatestcheck_bissbs>`]
+
+For some readers it might seem unnecessary to check the latest codebase at this
+point, especially if you did that already with a kernel prepared by your
+distributor or face a regression within a stable/longterm series. But it's
+highly recommended for these reasons:
+
+* You will run into any problems caused by your setup before you actually begin
+ a bisection. That will make it a lot easier to differentiate between 'this
+ most likely is some problem in my setup' and 'this change needs to be skipped
+ during the bisection, as the kernel sources at that stage contain an unrelated
+ problem that causes building or booting to fail'.
+
+* These steps will rule out if your problem is caused by some change in the
+ build configuration between the 'working' and the 'broken' kernel. This for
+ example can happen when your distributor enabled an additional security
+ feature in the newer kernel which was disabled or not yet supported by the
+ older kernel. That security feature might get into the way of something you
+ do -- in which case your problem from the perspective of the Linux kernel
+ upstream developers is not a regression, as
+ Documentation/admin-guide/reporting-regressions.rst explains in more detail.
+ You thus would waste your time if you'd try to bisect this.
+
+* If the cause for your regression was already fixed in the latest mainline
+ codebase, you'd perform the bisection for nothing. This holds true for a
+ regression you encountered with a stable/longterm release as well, as they are
+ often caused by problems in mainline changes that were backported -- in which
+ case the problem will have to be fixed in mainline first. Maybe it already was
+ fixed there and the fix is already in the process of being backported.
+
+* For regressions within a stable/longterm series it's furthermore crucial to
+ know if the issue is specific to that series or also happens in the mainline
+ kernel, as the report needs to be sent to different people:
+
+ * Regressions specific to a stable/longterm series are the stable team's
+ responsibility; mainline Linux developers might or might not care.
+
+ * Regressions also happening in mainline are something the regular Linux
+ developers and maintainers have to handle; the stable team does not care
+ and does not need to be involved in the report, they just should be told
+ to backport the fix once it's ready.
+
+ Your report might be ignored if you send it to the wrong party -- and even
+ when you get a reply there is a decent chance that developers tell you to
+ evaluate which of the two cases it is before they take a closer look.
+
+[:ref:`back to step-by-step guide <introlatestcheck_bissbs>`]
+
+.. _checkoutmaster_bisref:
+
+Check out the latest Linux codebase
+-----------------------------------
+
+ *Check out the latest Linux codebase.*
+ [:ref:`... <introlatestcheck_bissbs>`]
+
+In case you later want to recheck if an ever newer codebase might fix the
+problem, remember to run that ``git fetch --shallow-exclude [...]`` command
+again mentioned earlier to update your local Git repository.
+
+[:ref:`back to step-by-step guide <introlatestcheck_bissbs>`]
+
+.. _build_bisref:
+
+Build your kernel
+-----------------
+
+ *Build the image and the modules of your first kernel using the config file
+ you prepared.* [:ref:`... <build_bissbs>`]
+
+A lot can go wrong at this stage, but the instructions below will help you help
+yourself. Another subsection explains how to directly package your kernel up as
+deb, rpm or tar file.
+
+Dealing with build errors
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When a build error occurs, it might be caused by some aspect of your machine's
+setup that often can be fixed quickly; other times though the problem lies in
+the code and can only be fixed by a developer. A close examination of the
+failure messages coupled with some research on the internet will often tell you
+which of the two it is. To perform such investigation, restart the build
+process like this::
+
+ make V=1
+
+The ``V=1`` activates verbose output, which might be needed to see the actual
+error. To make it easier to spot, this command also omits the ``-j $(nproc
+--all)`` used earlier to utilize every CPU core in the system for the job -- but
+this parallelism also results in some clutter when failures occur.
+
+After a few seconds the build process should run into the error again. Now try
+to find the most crucial line describing the problem. Then search the internet
+for the most important and non-generic section of that line (say 4 to 8 words);
+avoid or remove anything that looks remotely system-specific, like your username
+or local path names like ``/home/username/linux/``. First try your regular
+internet search engine with that string, afterwards search Linux kernel mailing
+lists via `lore.kernel.org/all/ <https://lore.kernel.org/all/>`_.
+
+This most of the time will find something that will explain what is wrong; quite
+often one of the hits will provide a solution for your problem, too. If you
+do not find anything that matches your problem, try again from a different angle
+by modifying your search terms or using another line from the error messages.
+
+In the end, most issues you run into have likely been encountered and
+reported by others already. That includes issues where the cause is not your
+system, but lies in the code. If you run into one of those, you might thus find a
+solution (e.g. a patch) or workaround for your issue, too.
+
+Package your kernel up
+~~~~~~~~~~~~~~~~~~~~~~
+
+The step-by-step guide uses the default make targets (e.g. 'bzImage' and
+'modules' on x86) to build the image and the modules of your kernel, which later
+steps of the guide then install. You instead can also directly build everything
+and directly package it up by using one of the following targets:
+
+* ``make -j $(nproc --all) bindeb-pkg`` to generate a deb package
+
+* ``make -j $(nproc --all) binrpm-pkg`` to generate a rpm package
+
+* ``make -j $(nproc --all) tarbz2-pkg`` to generate a bz2 compressed tarball
+
+This is just a selection of available make targets for this purpose, see
+``make help`` for others. You can also use these targets after running
+``make -j $(nproc --all)``, as they will pick up everything already built.
+
+If you employ the targets to generate deb or rpm packages, ignore the
+step-by-step guide's instructions on installing and removing your kernel;
+instead install and remove the packages using the package utility for the format
+(e.g. dpkg and rpm) or a package management utility build on top of them (apt,
+aptitude, dnf/yum, zypper, ...). Be aware that the packages generated using
+these two make targets are designed to work on various distributions utilizing
+those formats, they thus will sometimes behave differently than your
+distribution's kernel packages.
+
+[:ref:`back to step-by-step guide <build_bissbs>`]
+
+.. _install_bisref:
+
+Put the kernel in place
+-----------------------
+
+ *Install the kernel you just built.* [:ref:`... <install_bissbs>`]
+
+What you need to do after executing the command in the step-by-step guide
+depends on the existence and the implementation of ``/sbin/installkernel``
+executable on your distribution.
+
+If installkernel is found, the kernel's build system will delegate the actual
+installation of your kernel image to this executable, which then performs some
+or all of these tasks:
+
+* On almost all Linux distributions installkernel will store your kernel's
+ image in /boot/, usually as '/boot/vmlinuz-<kernelrelease_id>'; often it will
+ put a 'System.map-<kernelrelease_id>' alongside it.
+
+* On most distributions installkernel will then generate an 'initramfs'
+ (sometimes also called 'initrd'), which usually are stored as
+ '/boot/initramfs-<kernelrelease_id>.img' or
+ '/boot/initrd-<kernelrelease_id>'. Commodity distributions rely on this file
+ for booting, hence ensure to execute the make target 'modules_install' first,
+ as your distribution's initramfs generator otherwise will be unable to find
+ the modules that go into the image.
+
+* On some distributions installkernel will then add an entry for your kernel
+ to your bootloader's configuration.
+
+You have to take care of some or all of the tasks yourself, if your
+distribution lacks a installkernel script or does only handle part of them.
+Consult the distribution's documentation for details. If in doubt, install the
+kernel manually::
+
+ sudo install -m 0600 $(make -s image_name) /boot/vmlinuz-$(make -s kernelrelease)
+ sudo install -m 0600 System.map /boot/System.map-$(make -s kernelrelease)
+
+Now generate your initramfs using the tools your distribution provides for this
+process. Afterwards add your kernel to your bootloader configuration and reboot.
+
+[:ref:`back to step-by-step guide <install_bissbs>`]
+
+.. _storagespace_bisref:
+
+Storage requirements per kernel
+-------------------------------
+
+ *Check how much storage space the kernel, its modules, and other related files
+ like the initramfs consume.* [:ref:`... <storagespace_bissbs>`]
+
+The kernels built during a bisection consume quite a bit of space in /boot/ and
+/lib/modules/, especially if you enabled debug symbols. That makes it easy to
+fill up volumes during a bisection -- and due to that even kernels which used to
+work earlier might fail to boot. To prevent that you will need to know how much
+space each installed kernel typically requires.
+
+Note, most of the time the pattern '/boot/*$(make -s kernelrelease)*' used in
+the guide will match all files needed to boot your kernel -- but neither the
+path nor the naming scheme are mandatory. On some distributions you thus will
+need to look in different places.
+
+[:ref:`back to step-by-step guide <storagespace_bissbs>`]
+
+.. _tainted_bisref:
+
+Check if your newly built kernel considers itself 'tainted'
+-----------------------------------------------------------
+
+ *Check if the kernel marked itself as 'tainted'.*
+ [:ref:`... <tainted_bissbs>`]
+
+Linux marks itself as tainted when something happens that potentially leads to
+follow-up errors that look totally unrelated. That is why developers might
+ignore or react scantly to reports from tainted kernels -- unless of course the
+kernel set the flag right when the reported bug occurred.
+
+That's why you want check why a kernel is tainted as explained in
+Documentation/admin-guide/tainted-kernels.rst; doing so is also in your own
+interest, as your testing might be flawed otherwise.
+
+[:ref:`back to step-by-step guide <tainted_bissbs>`]
+
+.. _recheckbroken_bisref:
+
+Check the kernel built from a recent mainline codebase
+------------------------------------------------------
+
+ *Verify if your bug occurs with the newly built kernel.*
+ [:ref:`... <recheckbroken_bissbs>`]
+
+There are a couple of reasons why your bug or regression might not show up with
+the kernel you built from the latest codebase. These are the most frequent:
+
+* The bug was fixed meanwhile.
+
+* What you suspected to be a regression was caused by a change in the build
+ configuration the provider of your kernel carried out.
+
+* Your problem might be a race condition that does not show up with your kernel;
+ the trimmed build configuration, a different setting for debug symbols, the
+ compiler used, and various other things can cause this.
+
+* In case you encountered the regression with a stable/longterm kernel it might
+ be a problem that is specific to that series; the next step in this guide will
+ check this.
+
+[:ref:`back to step-by-step guide <recheckbroken_bissbs>`]
+
+.. _recheckstablebroken_bisref:
+
+Check the kernel built from the latest stable/longterm codebase
+---------------------------------------------------------------
+
+ *Are you facing a regression within a stable/longterm release, but failed to
+ reproduce it with the kernel you just built using the latest mainline sources?
+ Then check if the latest codebase for the particular series might already fix
+ the problem.* [:ref:`... <recheckstablebroken_bissbs>`]
+
+If this kernel does not show the regression either, there most likely is no need
+for a bisection.
+
+[:ref:`back to step-by-step guide <recheckstablebroken_bissbs>`]
+
+.. _introworkingcheck_bisref:
+
+Ensure the 'good' version is really working well
+------------------------------------------------
+
+ *Check if the kernels you build work fine.*
+ [:ref:`... <introworkingcheck_bissbs>`]
+
+This section will reestablish a known working base. Skipping it might be
+appealing, but is usually a bad idea, as it does something important:
+
+It will ensure the .config file you prepared earlier actually works as expected.
+That is in your own interest, as trimming the configuration is not foolproof --
+and you might be building and testing ten or more kernels for nothing before
+starting to suspect something might be wrong with the build configuration.
+
+That alone is reason enough to spend the time on this, but not the only reason.
+
+Many readers of this guide normally run kernels that are patched, use add-on
+modules, or both. Those kernels thus are not considered 'vanilla' -- therefore
+it's possible that the thing that regressed might never have worked in vanilla
+builds of the 'good' version in the first place.
+
+There is a third reason for those that noticed a regression between
+stable/longterm kernels of different series (e.g. 6.0.13..6.1.5): it will
+ensure the kernel version you assumed to be 'good' earlier in the process (e.g.
+6.0) actually is working.
+
+[:ref:`back to step-by-step guide <introworkingcheck_bissbs>`]
+
+.. _recheckworking_bisref:
+
+Build your own version of the 'good' kernel
+-------------------------------------------
+
+ *Build your own variant of the working kernel and check if the feature that
+ regressed works as expected with it.* [:ref:`... <recheckworking_bissbs>`]
+
+In case the feature that broke with newer kernels does not work with your first
+self-built kernel, find and resolve the cause before moving on. There are a
+multitude of reasons why this might happen. Some ideas where to look:
+
+* Check the taint status and the output of ``dmesg``, maybe something unrelated
+ went wrong.
+
+* Maybe localmodconfig did something odd and disabled the module required to
+ test the feature? Then you might want to recreate a .config file based on the
+ one from the last working kernel and skip trimming it down; manually disabling
+ some features in the .config might work as well to reduce the build time.
+
+* Maybe it's not a kernel regression and something that is caused by some fluke,
+ a broken initramfs (also known as initrd), new firmware files, or an updated
+ userland software?
+
+* Maybe it was a feature added to your distributor's kernel which vanilla Linux
+ at that point never supported?
+
+Note, if you found and fixed problems with the .config file, you want to use it
+to build another kernel from the latest codebase, as your earlier tests with
+mainline and the latest version from an affected stable/longterm series were most
+likely flawed.
+
+[:ref:`back to step-by-step guide <recheckworking_bissbs>`]
+
+.. _bisectstart_bisref:
+
+Start the bisection
+-------------------
+
+ *Start the bisection and tell Git about the versions earlier established as
+ 'good' and 'bad'.* [:ref:`... <bisectstart_bissbs>`]
+
+This will start the bisection process; the last of the commands will make Git
+check out a commit round about half-way between the 'good' and the 'bad' changes
+for you to test.
+
+[:ref:`back to step-by-step guide <bisectstart_bissbs>`]
+
+.. _bisectbuild_bisref:
+
+Build a kernel from the bisection point
+---------------------------------------
+
+ *Build, install, and boot a kernel from the code Git checked out using the
+ same commands you used earlier.* [:ref:`... <bisectbuild_bissbs>`]
+
+There are two things worth of note here:
+
+* Occasionally building the kernel will fail or it might not boot due some
+ problem in the code at the bisection point. In that case run this command::
+
+ git bisect skip
+
+ Git will then check out another commit nearby which with a bit of luck should
+ work better. Afterwards restart executing this step.
+
+* Those slightly odd looking version identifiers can happen during bisections,
+ because the Linux kernel subsystems prepare their changes for a new mainline
+ release (say 6.2) before its predecessor (e.g. 6.1) is finished. They thus
+ base them on a somewhat earlier point like 6.1-rc1 or even 6.0 -- and then
+ get merged for 6.2 without rebasing nor squashing them once 6.1 is out. This
+ leads to those slightly odd looking version identifiers coming up during
+ bisections.
+
+[:ref:`back to step-by-step guide <bisectbuild_bissbs>`]
+
+.. _bisecttest_bisref:
+
+Bisection checkpoint
+--------------------
+
+ *Check if the feature that regressed works in the kernel you just built.*
+ [:ref:`... <bisecttest_bissbs>`]
+
+Ensure what you tell Git is accurate: getting it wrong just one time will bring
+the rest of the bisection totally off course, hence all testing after that point
+will be for nothing.
+
+[:ref:`back to step-by-step guide <bisecttest_bissbs>`]
+
+.. _bisectlog_bisref:
+
+Put the bisection log away
+--------------------------
+
+ *Store Git's bisection log and the current .config file in a safe place.*
+ [:ref:`... <bisectlog_bissbs>`]
+
+As indicated above: declaring just one kernel wrongly as 'good' or 'bad' will
+render the end result of a bisection useless. In that case you'd normally have
+to restart the bisection from scratch. The log can prevent that, as it might
+allow someone to point out where a bisection likely went sideways -- and then
+instead of testing ten or more kernels you might only have to build a few to
+resolve things.
+
+The .config file is put aside, as there is a decent chance that developers might
+ask for it after you report the regression.
+
+[:ref:`back to step-by-step guide <bisectlog_bissbs>`]
+
+.. _revert_bisref:
+
+Try reverting the culprit
+-------------------------
+
+ *Try reverting the culprit on top of the latest codebase to see if this fixes
+ your regression.* [:ref:`... <revert_bissbs>`]
+
+This is an optional step, but whenever possible one you should try: there is a
+decent chance that developers will ask you to perform this step when you bring
+the bisection result up. So give it a try, you are in the flow already, building
+one more kernel shouldn't be a big deal at this point.
+
+The step-by-step guide covers everything relevant already except one slightly
+rare thing: did you bisected a regression that also happened with mainline using
+a stable/longterm series, but Git failed to revert the commit in mainline? Then
+try to revert the culprit in the affected stable/longterm series -- and if that
+succeeds, test that kernel version instead.
+
+[:ref:`back to step-by-step guide <revert_bissbs>`]
+
+
+Supplementary tasks: cleanup during and after the bisection
+-----------------------------------------------------------
+
+.. _makeroom_bisref:
+
+Cleaning up during the bisection
+--------------------------------
+
+ *To remove one of the kernels you installed, look up its 'kernelrelease'
+ identifier.* [:ref:`... <makeroom_bissbs>`]
+
+The kernels you install during this process are easy to remove later, as its
+parts are only stored in two places and clearly identifiable. You thus do not
+need to worry to mess up your machine when you install a kernel manually (and
+thus bypass your distribution's packaging system): all parts of your kernels are
+relatively easy to remove later.
+
+One of the two places is a directory in /lib/modules/, which holds the modules
+for each installed kernel. This directory is named after the kernel's release
+identifier; hence, to remove all modules for one of the kernels you built,
+simply remove its modules directory in /lib/modules/.
+
+The other place is /boot/, where typically two up to five files will be placed
+during installation of a kernel. All of them usually contain the release name in
+their file name, but how many files and their exact names depend somewhat on
+your distribution's installkernel executable and its initramfs generator. On
+some distributions the ``kernel-install remove...`` command mentioned in the
+step-by-step guide will delete all of these files for you while also removing
+the menu entry for the kernel from your bootloader configuration. On others you
+have to take care of these two tasks yourself. The following command should
+interactively remove the three main files of a kernel with the release name
+'6.0-rc1-local-gcafec0cacaca0'::
+
+ rm -i /boot/{System.map,vmlinuz,initr}-6.0-rc1-local-gcafec0cacaca0
+
+Afterwards check for other files in /boot/ that have
+'6.0-rc1-local-gcafec0cacaca0' in their name and consider deleting them as well.
+Now remove the boot entry for the kernel from your bootloader's configuration;
+the steps to do that vary quite a bit between Linux distributions.
+
+Note, be careful with wildcards like '*' when deleting files or directories
+for kernels manually: you might accidentally remove files of a 6.0.11 kernel
+when all you want is to remove 6.0 or 6.0.1.
+
+[:ref:`back to step-by-step guide <makeroom_bissbs>`]
+
+Cleaning up after the bisection
+-------------------------------
+
+.. _finishingtouch_bisref:
+
+ *Once you have finished the bisection, do not immediately remove anything
+ you set up, as you might need a few things again.*
+ [:ref:`... <finishingtouch_bissbs>`]
+
+When you are really short of storage space removing the kernels as described in
+the step-by-step guide might not free as much space as you would like. In that
+case consider running ``rm -rf ~/linux/*`` as well now. This will remove the
+build artifacts and the Linux sources, but will leave the Git repository
+(~/linux/.git/) behind -- a simple ``git reset --hard`` thus will bring the
+sources back.
+
+Removing the repository as well would likely be unwise at this point: there is a
+decent chance developers will ask you to build another kernel to perform
+additional tests. This is often required to debug an issue or check proposed
+fixes. Before doing so you want to run the ``git fetch mainline`` command again
+followed by ``git checkout mainline/master`` to bring your clone up to date and
+checkout the latest codebase. Then apply the patch using ``git apply
+<filename>`` or ``git am <filename>`` and build yet another kernel using the
+familiar commands.
+
+Additional tests are also the reason why you want to keep the
+~/kernel-config-working file around for a few weeks.
+
+[:ref:`back to step-by-step guide <finishingtouch_bissbs>`]
+
+
+Additional reading material
+===========================
+
+Further sources
+---------------
+
+* The `man page for 'git bisect' <https://git-scm.com/docs/git-bisect>`_ and
+ `fighting regressions with 'git bisect' <https://git-scm.com/docs/git-bisect-lk2009.html>`_
+ in the Git documentation.
+* `Working with git bisect <https://nathanchance.dev/posts/working-with-git-bisect/>`_
+ from kernel developer Nathan Chancellor.
+* `Using Git bisect to figure out when brokenness was introduced <http://webchick.net/node/99>`_.
+* `Fully automated bisecting with 'git bisect run' <https://lwn.net/Articles/317154>`_.
+
+..
+ end-of-content
+..
+ This document is maintained by Thorsten Leemhuis <linux@leemhuis.info>. If
+ you spot a typo or small mistake, feel free to let him know directly and
+ he'll fix it. You are free to do the same in a mostly informal way if you
+ want to contribute changes to the text -- but for copyright reasons please CC
+ linux-doc@vger.kernel.org and 'sign-off' your contribution as
+ Documentation/process/submitting-patches.rst explains in the section 'Sign
+ your work - the Developer's Certificate of Origin'.
+..
+ This text is available under GPL-2.0+ or CC-BY-4.0, as stated at the top
+ of the file. If you want to distribute this text under CC-BY-4.0 only,
+ please use 'The Linux kernel development community' for author attribution
+ and link this as source:
+ https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/plain/Documentation/admin-guide/verify-bugs-and-bisect-regressions.rst
+
+..
+ Note: Only the content of this RST file as found in the Linux kernel sources
+ is available under CC-BY-4.0, as versions of this text that were processed
+ (for example by the kernel's build system) might contain content taken from
+ files which use a more restrictive license.
diff --git a/Documentation/arch/arm64/elf_hwcaps.rst b/Documentation/arch/arm64/elf_hwcaps.rst
index ced7b33..448c166 100644
--- a/Documentation/arch/arm64/elf_hwcaps.rst
+++ b/Documentation/arch/arm64/elf_hwcaps.rst
@@ -317,6 +317,55 @@
HWCAP2_LSE128
Functionality implied by ID_AA64ISAR0_EL1.Atomic == 0b0011.
+HWCAP2_FPMR
+ Functionality implied by ID_AA64PFR2_EL1.FMR == 0b0001.
+
+HWCAP2_LUT
+ Functionality implied by ID_AA64ISAR2_EL1.LUT == 0b0001.
+
+HWCAP2_FAMINMAX
+ Functionality implied by ID_AA64ISAR3_EL1.FAMINMAX == 0b0001.
+
+HWCAP2_F8CVT
+ Functionality implied by ID_AA64FPFR0_EL1.F8CVT == 0b1.
+
+HWCAP2_F8FMA
+ Functionality implied by ID_AA64FPFR0_EL1.F8FMA == 0b1.
+
+HWCAP2_F8DP4
+ Functionality implied by ID_AA64FPFR0_EL1.F8DP4 == 0b1.
+
+HWCAP2_F8DP2
+ Functionality implied by ID_AA64FPFR0_EL1.F8DP2 == 0b1.
+
+HWCAP2_F8E4M3
+ Functionality implied by ID_AA64FPFR0_EL1.F8E4M3 == 0b1.
+
+HWCAP2_F8E5M2
+ Functionality implied by ID_AA64FPFR0_EL1.F8E5M2 == 0b1.
+
+HWCAP2_SME_LUTV2
+ Functionality implied by ID_AA64SMFR0_EL1.LUTv2 == 0b1.
+
+HWCAP2_SME_F8F16
+ Functionality implied by ID_AA64SMFR0_EL1.F8F16 == 0b1.
+
+HWCAP2_SME_F8F32
+ Functionality implied by ID_AA64SMFR0_EL1.F8F32 == 0b1.
+
+HWCAP2_SME_SF8FMA
+ Functionality implied by ID_AA64SMFR0_EL1.SF8FMA == 0b1.
+
+HWCAP2_SME_SF8DP4
+ Functionality implied by ID_AA64SMFR0_EL1.SF8DP4 == 0b1.
+
+HWCAP2_SME_SF8DP2
+ Functionality implied by ID_AA64SMFR0_EL1.SF8DP2 == 0b1.
+
+HWCAP2_SME_SF8DP4
+ Functionality implied by ID_AA64SMFR0_EL1.SF8DP4 == 0b1.
+
+
4. Unused AT_HWCAP bits
-----------------------
diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst
index e8c2ce1..d33e27c 100644
--- a/Documentation/arch/arm64/silicon-errata.rst
+++ b/Documentation/arch/arm64/silicon-errata.rst
@@ -35,8 +35,9 @@
For software workarounds that may adversely impact systems unaffected by
the erratum in question, a Kconfig entry is added under "Kernel
Features" -> "ARM errata workarounds via the alternatives framework".
-These are enabled by default and patched in at runtime when an affected
-CPU is detected. For less-intrusive workarounds, a Kconfig option is not
+With the exception of workarounds for errata deemed "rare" by Arm, these
+are enabled by default and patched in at runtime when an affected CPU is
+detected. For less-intrusive workarounds, a Kconfig option is not
available and the code is structured (preferably with a comment) in such
a way that the erratum will not be hit.
@@ -243,3 +244,10 @@
+----------------+-----------------+-----------------+-----------------------------+
| ASR | ASR8601 | #8601001 | N/A |
+----------------+-----------------+-----------------+-----------------------------+
++----------------+-----------------+-----------------+-----------------------------+
+| Microsoft | Azure Cobalt 100| #2139208 | ARM64_ERRATUM_2139208 |
++----------------+-----------------+-----------------+-----------------------------+
+| Microsoft | Azure Cobalt 100| #2067961 | ARM64_ERRATUM_2067961 |
++----------------+-----------------+-----------------+-----------------------------+
+| Microsoft | Azure Cobalt 100| #2253138 | ARM64_ERRATUM_2253138 |
++----------------+-----------------+-----------------+-----------------------------+
diff --git a/Documentation/arch/arm64/sme.rst b/Documentation/arch/arm64/sme.rst
index 3d0e53e..be317d4 100644
--- a/Documentation/arch/arm64/sme.rst
+++ b/Documentation/arch/arm64/sme.rst
@@ -75,7 +75,7 @@
2. Vector lengths
------------------
-SME defines a second vector length similar to the SVE vector length which is
+SME defines a second vector length similar to the SVE vector length which
controls the size of the streaming mode SVE vectors and the ZA matrix array.
The ZA matrix is square with each side having as many bytes as a streaming
mode SVE vector.
@@ -238,12 +238,12 @@
bits of Z0..Z31 except for Z0 bits [127:0] .. Z31 bits [127:0] to become
unspecified, including both streaming and non-streaming SVE state.
Calling PR_SME_SET_VL with vl equal to the thread's current vector
- length, or calling PR_SME_SET_VL with the PR_SVE_SET_VL_ONEXEC flag,
+ length, or calling PR_SME_SET_VL with the PR_SME_SET_VL_ONEXEC flag,
does not constitute a change to the vector length for this purpose.
* Changing the vector length causes PSTATE.ZA and PSTATE.SM to be cleared.
Calling PR_SME_SET_VL with vl equal to the thread's current vector
- length, or calling PR_SME_SET_VL with the PR_SVE_SET_VL_ONEXEC flag,
+ length, or calling PR_SME_SET_VL with the PR_SME_SET_VL_ONEXEC flag,
does not constitute a change to the vector length for this purpose.
@@ -379,9 +379,8 @@
/proc/sys/abi/sme_default_vector_length
Writing the text representation of an integer to this file sets the system
- default vector length to the specified value, unless the value is greater
- than the maximum vector length supported by the system in which case the
- default vector length is set to that maximum.
+ default vector length to the specified value rounded to a supported value
+ using the same rules as for setting vector length via PR_SME_SET_VL.
The result can be determined by reopening the file and reading its
contents.
diff --git a/Documentation/arch/arm64/sve.rst b/Documentation/arch/arm64/sve.rst
index 0d9a426..8d8837f 100644
--- a/Documentation/arch/arm64/sve.rst
+++ b/Documentation/arch/arm64/sve.rst
@@ -117,11 +117,6 @@
* The SVE registers are not used to pass arguments to or receive results from
any syscall.
-* In practice the affected registers/bits will be preserved or will be replaced
- with zeros on return from a syscall, but userspace should not make
- assumptions about this. The kernel behaviour may vary on a case-by-case
- basis.
-
* All other SVE state of a thread, including the currently configured vector
length, the state of the PR_SVE_VL_INHERIT flag, and the deferred vector
length (if any), is preserved across all syscalls, subject to the specific
@@ -428,9 +423,8 @@
/proc/sys/abi/sve_default_vector_length
Writing the text representation of an integer to this file sets the system
- default vector length to the specified value, unless the value is greater
- than the maximum vector length supported by the system in which case the
- default vector length is set to that maximum.
+ default vector length to the specified value rounded to a supported value
+ using the same rules as for setting vector length via PR_SVE_SET_VL.
The result can be determined by reopening the file and reading its
contents.
diff --git a/Documentation/arch/riscv/vm-layout.rst b/Documentation/arch/riscv/vm-layout.rst
index 69ff6da..e476b43 100644
--- a/Documentation/arch/riscv/vm-layout.rst
+++ b/Documentation/arch/riscv/vm-layout.rst
@@ -144,14 +144,8 @@
smaller than sv48, the CPU maximum supported address space will be the default.
Software can "opt-in" to receiving VAs from another VA space by providing
-a hint address to mmap. A hint address passed to mmap will cause the largest
-address space that fits entirely into the hint to be used, unless there is no
-space left in the address space. If there is no space available in the requested
-address space, an address in the next smallest available address space will be
-returned.
-
-For example, in order to obtain 48-bit VA space, a hint address greater than
-:code:`1 << 47` must be provided. Note that this is 47 due to sv48 userspace
-ending at :code:`1 << 47` and the addresses beyond this are reserved for the
-kernel. Similarly, to obtain 57-bit VA space addresses, a hint address greater
-than or equal to :code:`1 << 56` must be provided.
+a hint address to mmap. When a hint address is passed to mmap, the returned
+address will never use more bits than the hint address. For example, if a hint
+address of `1 << 40` is passed to mmap, a valid returned address will never use
+bits 41 through 63. If no mappable addresses are available in that range, mmap
+will return `MAP_FAILED`.
diff --git a/Documentation/arch/x86/amd-memory-encryption.rst b/Documentation/arch/x86/amd-memory-encryption.rst
index 07caa8f..414bc74 100644
--- a/Documentation/arch/x86/amd-memory-encryption.rst
+++ b/Documentation/arch/x86/amd-memory-encryption.rst
@@ -87,14 +87,14 @@
kernel is non-zero).
SME can also be enabled and activated in the BIOS. If SME is enabled and
-activated in the BIOS, then all memory accesses will be encrypted and it will
-not be necessary to activate the Linux memory encryption support. If the BIOS
-merely enables SME (sets bit 23 of the MSR_AMD64_SYSCFG), then Linux can activate
-memory encryption by default (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y) or
-by supplying mem_encrypt=on on the kernel command line. However, if BIOS does
-not enable SME, then Linux will not be able to activate memory encryption, even
-if configured to do so by default or the mem_encrypt=on command line parameter
-is specified.
+activated in the BIOS, then all memory accesses will be encrypted and it
+will not be necessary to activate the Linux memory encryption support.
+
+If the BIOS merely enables SME (sets bit 23 of the MSR_AMD64_SYSCFG),
+then memory encryption can be enabled by supplying mem_encrypt=on on the
+kernel command line. However, if BIOS does not enable SME, then Linux
+will not be able to activate memory encryption, even if configured to do
+so by default or the mem_encrypt=on command line parameter is specified.
Secure Nested Paging (SNP)
==========================
diff --git a/Documentation/arch/x86/amd_hsmp.rst b/Documentation/arch/x86/amd_hsmp.rst
index c92bfd5..1e499ec 100644
--- a/Documentation/arch/x86/amd_hsmp.rst
+++ b/Documentation/arch/x86/amd_hsmp.rst
@@ -13,7 +13,8 @@
More details on the interface can be found in chapter
"7 Host System Management Port (HSMP)" of the family/model PPR
-Eg: https://www.amd.com/system/files/TechDocs/55898_B1_pub_0.50.zip
+Eg: https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/programmer-references/55898_B1_pub_0_50.zip
+
HSMP interface is supported on EPYC server CPU models only.
@@ -97,8 +98,8 @@
More details on the interface and message definitions can be found in chapter
"7 Host System Management Port (HSMP)" of the respective family/model PPR
-eg: https://www.amd.com/system/files/TechDocs/55898_B1_pub_0.50.zip
+eg: https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/programmer-references/55898_B1_pub_0_50.zip
User space C-APIs are made available by linking against the esmi library,
-which is provided by the E-SMS project https://developer.amd.com/e-sms/.
+which is provided by the E-SMS project https://www.amd.com/en/developer/e-sms.html.
See: https://github.com/amd/esmi_ib_library
diff --git a/Documentation/arch/x86/boot.rst b/Documentation/arch/x86/boot.rst
index c513855..4fd492cb 100644
--- a/Documentation/arch/x86/boot.rst
+++ b/Documentation/arch/x86/boot.rst
@@ -878,7 +878,8 @@
address if possible.
A non-relocatable kernel will unconditionally move itself and to run
- at this address.
+ at this address. A relocatable kernel will move itself to this address if it
+ loaded below this address.
============ =======
Field name: init_size
diff --git a/Documentation/arch/x86/mds.rst b/Documentation/arch/x86/mds.rst
index e73fdff..c58c723 100644
--- a/Documentation/arch/x86/mds.rst
+++ b/Documentation/arch/x86/mds.rst
@@ -95,6 +95,9 @@
mds_clear_cpu_buffers()
+Also macro CLEAR_CPU_BUFFERS can be used in ASM late in exit-to-user path.
+Other than CFLAGS.ZF, this macro doesn't clobber any registers.
+
The mitigation is invoked on kernel/userspace, hypervisor/guest and C-state
(idle) transitions.
@@ -138,17 +141,30 @@
When transitioning from kernel to user space the CPU buffers are flushed
on affected CPUs when the mitigation is not disabled on the kernel
- command line. The migitation is enabled through the static key
- mds_user_clear.
+ command line. The mitigation is enabled through the feature flag
+ X86_FEATURE_CLEAR_CPU_BUF.
- The mitigation is invoked in prepare_exit_to_usermode() which covers
- all but one of the kernel to user space transitions. The exception
- is when we return from a Non Maskable Interrupt (NMI), which is
- handled directly in do_nmi().
+ The mitigation is invoked just before transitioning to userspace after
+ user registers are restored. This is done to minimize the window in
+ which kernel data could be accessed after VERW e.g. via an NMI after
+ VERW.
- (The reason that NMI is special is that prepare_exit_to_usermode() can
- enable IRQs. In NMI context, NMIs are blocked, and we don't want to
- enable IRQs with NMIs blocked.)
+ **Corner case not handled**
+ Interrupts returning to kernel don't clear CPUs buffers since the
+ exit-to-user path is expected to do that anyways. But, there could be
+ a case when an NMI is generated in kernel after the exit-to-user path
+ has cleared the buffers. This case is not handled and NMI returning to
+ kernel don't clear CPU buffers because:
+
+ 1. It is rare to get an NMI after VERW, but before returning to userspace.
+ 2. For an unprivileged user, there is no known way to make that NMI
+ less rare or target it.
+ 3. It would take a large number of these precisely-timed NMIs to mount
+ an actual attack. There's presumably not enough bandwidth.
+ 4. The NMI in question occurs after a VERW, i.e. when user state is
+ restored and most interesting data is already scrubbed. Whats left
+ is only the data that NMI touches, and that may or may not be of
+ any interest.
2. C-State transition
diff --git a/Documentation/arch/x86/pti.rst b/Documentation/arch/x86/pti.rst
index e08d351..57e8392 100644
--- a/Documentation/arch/x86/pti.rst
+++ b/Documentation/arch/x86/pti.rst
@@ -26,9 +26,9 @@
This approach helps to ensure that side-channel attacks leveraging
the paging structures do not function when PTI is enabled. It can be
-enabled by setting CONFIG_PAGE_TABLE_ISOLATION=y at compile time.
-Once enabled at compile-time, it can be disabled at boot with the
-'nopti' or 'pti=' kernel parameters (see kernel-parameters.txt).
+enabled by setting CONFIG_MITIGATION_PAGE_TABLE_ISOLATION=y at compile
+time. Once enabled at compile-time, it can be disabled at boot with
+the 'nopti' or 'pti=' kernel parameters (see kernel-parameters.txt).
Page Table Management
=====================
diff --git a/Documentation/arch/x86/resctrl.rst b/Documentation/arch/x86/resctrl.rst
index a6279df..6c24558 100644
--- a/Documentation/arch/x86/resctrl.rst
+++ b/Documentation/arch/x86/resctrl.rst
@@ -45,7 +45,7 @@
Enable code/data prioritization in L2 cache allocations.
"mba_MBps":
Enable the MBA Software Controller(mba_sc) to specify MBA
- bandwidth in MBps
+ bandwidth in MiBps
"debug":
Make debug files accessible. Available debug files are annotated with
"Available only with debug option".
@@ -526,7 +526,7 @@
increase or vary although user specified bandwidth percentage is same.
In order to mitigate this and make the interface more user friendly,
-resctrl added support for specifying the bandwidth in MBps as well. The
+resctrl added support for specifying the bandwidth in MiBps as well. The
kernel underneath would use a software feedback mechanism or a "Software
Controller(mba_sc)" which reads the actual bandwidth using MBM counters
and adjust the memory bandwidth percentages to ensure::
@@ -573,13 +573,13 @@
MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;...
-Memory bandwidth Allocation specified in MBps
----------------------------------------------
+Memory bandwidth Allocation specified in MiBps
+----------------------------------------------
Memory bandwidth domain is L3 cache.
::
- MB:<cache_id0>=bw_MBps0;<cache_id1>=bw_MBps1;...
+ MB:<cache_id0>=bw_MiBps0;<cache_id1>=bw_MiBps1;...
Slow Memory Bandwidth Allocation (SMBA)
---------------------------------------
diff --git a/Documentation/arch/x86/topology.rst b/Documentation/arch/x86/topology.rst
index 08ebf9e..7352ab8 100644
--- a/Documentation/arch/x86/topology.rst
+++ b/Documentation/arch/x86/topology.rst
@@ -47,17 +47,21 @@
Package-related topology information in the kernel:
- - cpuinfo_x86.x86_max_cores:
+ - topology_num_threads_per_package()
- The number of cores in a package. This information is retrieved via CPUID.
+ The number of threads in a package.
- - cpuinfo_x86.x86_max_dies:
+ - topology_num_cores_per_package()
- The number of dies in a package. This information is retrieved via CPUID.
+ The number of cores in a package.
+
+ - topology_max_dies_per_package()
+
+ The maximum number of dies in a package.
- cpuinfo_x86.topo.die_id:
- The physical ID of the die. This information is retrieved via CPUID.
+ The physical ID of the die.
- cpuinfo_x86.topo.pkg_id:
@@ -96,16 +100,6 @@
AMDs nomenclature for a CMT core is "Compute Unit". The kernel always uses
"core".
-Core-related topology information in the kernel:
-
- - smp_num_siblings:
-
- The number of threads in a core. The number of threads in a package can be
- calculated by::
-
- threads_per_package = cpuinfo_x86.x86_max_cores * smp_num_siblings
-
-
Threads
=======
A thread is a single scheduling unit. It's the equivalent to a logical Linux
diff --git a/Documentation/arch/x86/x86_64/fred.rst b/Documentation/arch/x86/x86_64/fred.rst
new file mode 100644
index 0000000..9f57e7b
--- /dev/null
+++ b/Documentation/arch/x86/x86_64/fred.rst
@@ -0,0 +1,96 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=========================================
+Flexible Return and Event Delivery (FRED)
+=========================================
+
+Overview
+========
+
+The FRED architecture defines simple new transitions that change
+privilege level (ring transitions). The FRED architecture was
+designed with the following goals:
+
+1) Improve overall performance and response time by replacing event
+ delivery through the interrupt descriptor table (IDT event
+ delivery) and event return by the IRET instruction with lower
+ latency transitions.
+
+2) Improve software robustness by ensuring that event delivery
+ establishes the full supervisor context and that event return
+ establishes the full user context.
+
+The new transitions defined by the FRED architecture are FRED event
+delivery and, for returning from events, two FRED return instructions.
+FRED event delivery can effect a transition from ring 3 to ring 0, but
+it is used also to deliver events incident to ring 0. One FRED
+instruction (ERETU) effects a return from ring 0 to ring 3, while the
+other (ERETS) returns while remaining in ring 0. Collectively, FRED
+event delivery and the FRED return instructions are FRED transitions.
+
+In addition to these transitions, the FRED architecture defines a new
+instruction (LKGS) for managing the state of the GS segment register.
+The LKGS instruction can be used by 64-bit operating systems that do
+not use the new FRED transitions.
+
+Furthermore, the FRED architecture is easy to extend for future CPU
+architectures.
+
+Software based event dispatching
+================================
+
+FRED operates differently from IDT in terms of event handling. Instead
+of directly dispatching an event to its handler based on the event
+vector, FRED requires the software to dispatch an event to its handler
+based on both the event's type and vector. Therefore, an event dispatch
+framework must be implemented to facilitate the event-to-handler
+dispatch process. The FRED event dispatch framework takes control
+once an event is delivered, and employs a two-level dispatch.
+
+The first level dispatching is event type based, and the second level
+dispatching is event vector based.
+
+Full supervisor/user context
+============================
+
+FRED event delivery atomically save and restore full supervisor/user
+context upon event delivery and return. Thus it avoids the problem of
+transient states due to %cr2 and/or %dr6, and it is no longer needed
+to handle all the ugly corner cases caused by half baked entry states.
+
+FRED allows explicit unblock of NMI with new event return instructions
+ERETS/ERETU, avoiding the mess caused by IRET which unconditionally
+unblocks NMI, e.g., when an exception happens during NMI handling.
+
+FRED always restores the full value of %rsp, thus ESPFIX is no longer
+needed when FRED is enabled.
+
+LKGS
+====
+
+LKGS behaves like the MOV to GS instruction except that it loads the
+base address into the IA32_KERNEL_GS_BASE MSR instead of the GS
+segment’s descriptor cache. With LKGS, it ends up with avoiding
+mucking with kernel GS, i.e., an operating system can always operate
+with its own GS base address.
+
+Because FRED event delivery from ring 3 and ERETU both swap the value
+of the GS base address and that of the IA32_KERNEL_GS_BASE MSR, plus
+the introduction of LKGS instruction, the SWAPGS instruction is no
+longer needed when FRED is enabled, thus is disallowed (#UD).
+
+Stack levels
+============
+
+4 stack levels 0~3 are introduced to replace the nonreentrant IST for
+event handling, and each stack level should be configured to use a
+dedicated stack.
+
+The current stack level could be unchanged or go higher upon FRED
+event delivery. If unchanged, the CPU keeps using the current event
+stack. If higher, the CPU switches to a new event stack specified by
+the MSR of the new stack level, i.e., MSR_IA32_FRED_RSP[123].
+
+Only execution of a FRED return instruction ERET[US], could lower the
+current stack level, causing the CPU to switch back to the stack it was
+on before a previous event delivery that promoted the stack level.
diff --git a/Documentation/arch/x86/x86_64/index.rst b/Documentation/arch/x86/x86_64/index.rst
index a56070f..ad15e9b 100644
--- a/Documentation/arch/x86/x86_64/index.rst
+++ b/Documentation/arch/x86/x86_64/index.rst
@@ -15,3 +15,4 @@
cpu-hotplug-spec
machinecheck
fsgs
+ fred
diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
index 7985c66..a8f5782 100644
--- a/Documentation/bpf/kfuncs.rst
+++ b/Documentation/bpf/kfuncs.rst
@@ -177,10 +177,10 @@
type of kfunc(s) being registered with the BPF subsystem. To do so, we define
flags on a set of kfuncs as follows::
- BTF_SET8_START(bpf_task_set)
+ BTF_KFUNCS_START(bpf_task_set)
BTF_ID_FLAGS(func, bpf_get_task_pid, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_put_pid, KF_RELEASE)
- BTF_SET8_END(bpf_task_set)
+ BTF_KFUNCS_END(bpf_task_set)
This set encodes the BTF ID of each kfunc listed above, and encodes the flags
along with it. Ofcourse, it is also allowed to specify no flags.
@@ -347,10 +347,10 @@
registering it with the BPF subsystem. Registration is done per BPF program
type. An example is shown below::
- BTF_SET8_START(bpf_task_set)
+ BTF_KFUNCS_START(bpf_task_set)
BTF_ID_FLAGS(func, bpf_get_task_pid, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_put_pid, KF_RELEASE)
- BTF_SET8_END(bpf_task_set)
+ BTF_KFUNCS_END(bpf_task_set)
static const struct btf_kfunc_id_set bpf_task_kfunc_set = {
.owner = THIS_MODULE,
diff --git a/Documentation/bpf/map_lpm_trie.rst b/Documentation/bpf/map_lpm_trie.rst
index 74d64a3..f9cd579 100644
--- a/Documentation/bpf/map_lpm_trie.rst
+++ b/Documentation/bpf/map_lpm_trie.rst
@@ -17,7 +17,7 @@
LPM tries may be created with a maximum prefix length that is a multiple
of 8, in the range from 8 to 2048. The key used for lookup and update
-operations is a ``struct bpf_lpm_trie_key``, extended by
+operations is a ``struct bpf_lpm_trie_key_u8``, extended by
``max_prefixlen/8`` bytes.
- For IPv4 addresses the data length is 4 bytes
diff --git a/Documentation/bpf/standardization/instruction-set.rst b/Documentation/bpf/standardization/instruction-set.rst
index 245b6de..a5ab00a 100644
--- a/Documentation/bpf/standardization/instruction-set.rst
+++ b/Documentation/bpf/standardization/instruction-set.rst
@@ -1,11 +1,11 @@
.. contents::
.. sectnum::
-=======================================
-BPF Instruction Set Specification, v1.0
-=======================================
+======================================
+BPF Instruction Set Architecture (ISA)
+======================================
-This document specifies version 1.0 of the BPF instruction set.
+This document specifies the BPF instruction set architecture (ISA).
Documentation conventions
=========================
@@ -24,22 +24,22 @@
.. table:: Meaning of signedness notation.
==== =========
- `S` Meaning
+ S Meaning
==== =========
- `u` unsigned
- `s` signed
+ u unsigned
+ s signed
==== =========
.. table:: Meaning of bit-width notation.
===== =========
- `N` Bit width
+ N Bit width
===== =========
- `8` 8 bits
- `16` 16 bits
- `32` 32 bits
- `64` 64 bits
- `128` 128 bits
+ 8 8 bits
+ 16 16 bits
+ 32 32 bits
+ 64 64 bits
+ 128 128 bits
===== =========
For example, `u32` is a type whose valid values are all the 32-bit unsigned
@@ -48,31 +48,31 @@
Functions
---------
-* `htobe16`: Takes an unsigned 16-bit number in host-endian format and
+* htobe16: Takes an unsigned 16-bit number in host-endian format and
returns the equivalent number as an unsigned 16-bit number in big-endian
format.
-* `htobe32`: Takes an unsigned 32-bit number in host-endian format and
+* htobe32: Takes an unsigned 32-bit number in host-endian format and
returns the equivalent number as an unsigned 32-bit number in big-endian
format.
-* `htobe64`: Takes an unsigned 64-bit number in host-endian format and
+* htobe64: Takes an unsigned 64-bit number in host-endian format and
returns the equivalent number as an unsigned 64-bit number in big-endian
format.
-* `htole16`: Takes an unsigned 16-bit number in host-endian format and
+* htole16: Takes an unsigned 16-bit number in host-endian format and
returns the equivalent number as an unsigned 16-bit number in little-endian
format.
-* `htole32`: Takes an unsigned 32-bit number in host-endian format and
+* htole32: Takes an unsigned 32-bit number in host-endian format and
returns the equivalent number as an unsigned 32-bit number in little-endian
format.
-* `htole64`: Takes an unsigned 64-bit number in host-endian format and
+* htole64: Takes an unsigned 64-bit number in host-endian format and
returns the equivalent number as an unsigned 64-bit number in little-endian
format.
-* `bswap16`: Takes an unsigned 16-bit number in either big- or little-endian
+* bswap16: Takes an unsigned 16-bit number in either big- or little-endian
format and returns the equivalent number with the same bit width but
opposite endianness.
-* `bswap32`: Takes an unsigned 32-bit number in either big- or little-endian
+* bswap32: Takes an unsigned 32-bit number in either big- or little-endian
format and returns the equivalent number with the same bit width but
opposite endianness.
-* `bswap64`: Takes an unsigned 64-bit number in either big- or little-endian
+* bswap64: Takes an unsigned 64-bit number in either big- or little-endian
format and returns the equivalent number with the same bit width but
opposite endianness.
@@ -97,40 +97,101 @@
A: 10000110
B: 11111111 10000110
+Conformance groups
+------------------
+
+An implementation does not need to support all instructions specified in this
+document (e.g., deprecated instructions). Instead, a number of conformance
+groups are specified. An implementation must support the base32 conformance
+group and may support additional conformance groups, where supporting a
+conformance group means it must support all instructions in that conformance
+group.
+
+The use of named conformance groups enables interoperability between a runtime
+that executes instructions, and tools as such compilers that generate
+instructions for the runtime. Thus, capability discovery in terms of
+conformance groups might be done manually by users or automatically by tools.
+
+Each conformance group has a short ASCII label (e.g., "base32") that
+corresponds to a set of instructions that are mandatory. That is, each
+instruction has one or more conformance groups of which it is a member.
+
+This document defines the following conformance groups:
+
+* base32: includes all instructions defined in this
+ specification unless otherwise noted.
+* base64: includes base32, plus instructions explicitly noted
+ as being in the base64 conformance group.
+* atomic32: includes 32-bit atomic operation instructions (see `Atomic operations`_).
+* atomic64: includes atomic32, plus 64-bit atomic operation instructions.
+* divmul32: includes 32-bit division, multiplication, and modulo instructions.
+* divmul64: includes divmul32, plus 64-bit division, multiplication,
+ and modulo instructions.
+* packet: deprecated packet access instructions.
+
Instruction encoding
====================
BPF has two instruction encodings:
* the basic instruction encoding, which uses 64 bits to encode an instruction
-* the wide instruction encoding, which appends a second 64-bit immediate (i.e.,
- constant) value after the basic instruction for a total of 128 bits.
+* the wide instruction encoding, which appends a second 64 bits
+ after the basic instruction for a total of 128 bits.
-The fields conforming an encoded basic instruction are stored in the
-following order::
+Basic instruction encoding
+--------------------------
- opcode:8 src_reg:4 dst_reg:4 offset:16 imm:32 // In little-endian BPF.
- opcode:8 dst_reg:4 src_reg:4 offset:16 imm:32 // In big-endian BPF.
+A basic instruction is encoded as follows::
-**imm**
- signed integer immediate value
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | opcode | regs | offset |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | imm |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+**opcode**
+ operation to perform, encoded as follows::
+
+ +-+-+-+-+-+-+-+-+
+ |specific |class|
+ +-+-+-+-+-+-+-+-+
+
+ **specific**
+ The format of these bits varies by instruction class
+
+ **class**
+ The instruction class (see `Instruction classes`_)
+
+**regs**
+ The source and destination register numbers, encoded as follows
+ on a little-endian host::
+
+ +-+-+-+-+-+-+-+-+
+ |src_reg|dst_reg|
+ +-+-+-+-+-+-+-+-+
+
+ and as follows on a big-endian host::
+
+ +-+-+-+-+-+-+-+-+
+ |dst_reg|src_reg|
+ +-+-+-+-+-+-+-+-+
+
+ **src_reg**
+ the source register number (0-10), except where otherwise specified
+ (`64-bit immediate instructions`_ reuse this field for other purposes)
+
+ **dst_reg**
+ destination register number (0-10)
**offset**
signed integer offset used with pointer arithmetic
-**src_reg**
- the source register number (0-10), except where otherwise specified
- (`64-bit immediate instructions`_ reuse this field for other purposes)
+**imm**
+ signed integer immediate value
-**dst_reg**
- destination register number (0-10)
-
-**opcode**
- operation to perform
-
-Note that the contents of multi-byte fields ('imm' and 'offset') are
-stored using big-endian byte ordering in big-endian BPF and
-little-endian byte ordering in little-endian BPF.
+Note that the contents of multi-byte fields ('offset' and 'imm') are
+stored using big-endian byte ordering on big-endian hosts and
+little-endian byte ordering on little-endian hosts.
For example::
@@ -143,71 +204,83 @@
Note that most instructions do not use all of the fields.
Unused fields shall be cleared to zero.
-As discussed below in `64-bit immediate instructions`_, a 64-bit immediate
-instruction uses a 64-bit immediate value that is constructed as follows.
-The 64 bits following the basic instruction contain a pseudo instruction
-using the same format but with opcode, dst_reg, src_reg, and offset all set to zero,
-and imm containing the high 32 bits of the immediate value.
+Wide instruction encoding
+--------------------------
+
+Some instructions are defined to use the wide instruction encoding,
+which uses two 32-bit immediate values. The 64 bits following
+the basic instruction format contain a pseudo instruction
+with 'opcode', 'dst_reg', 'src_reg', and 'offset' all set to zero.
This is depicted in the following figure::
- basic_instruction
- .-----------------------------.
- | |
- code:8 regs:8 offset:16 imm:32 unused:32 imm:32
- | |
- '--------------'
- pseudo instruction
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | opcode | regs | offset |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | imm |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | reserved |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | next_imm |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-Thus the 64-bit immediate value is constructed as follows:
+**opcode**
+ operation to perform, encoded as explained above
- imm64 = (next_imm << 32) | imm
+**regs**
+ The source and destination register numbers, encoded as explained above
-where 'next_imm' refers to the imm value of the pseudo instruction
-following the basic instruction. The unused bytes in the pseudo
-instruction are reserved and shall be cleared to zero.
+**offset**
+ signed integer offset used with pointer arithmetic
+
+**imm**
+ signed integer immediate value
+
+**reserved**
+ unused, set to zero
+
+**next_imm**
+ second signed integer immediate value
Instruction classes
-------------------
-The three LSB bits of the 'opcode' field store the instruction class:
+The three least significant bits of the 'opcode' field store the instruction class:
-========= ===== =============================== ===================================
-class value description reference
-========= ===== =============================== ===================================
-BPF_LD 0x00 non-standard load operations `Load and store instructions`_
-BPF_LDX 0x01 load into register operations `Load and store instructions`_
-BPF_ST 0x02 store from immediate operations `Load and store instructions`_
-BPF_STX 0x03 store from register operations `Load and store instructions`_
-BPF_ALU 0x04 32-bit arithmetic operations `Arithmetic and jump instructions`_
-BPF_JMP 0x05 64-bit jump operations `Arithmetic and jump instructions`_
-BPF_JMP32 0x06 32-bit jump operations `Arithmetic and jump instructions`_
-BPF_ALU64 0x07 64-bit arithmetic operations `Arithmetic and jump instructions`_
-========= ===== =============================== ===================================
+===== ===== =============================== ===================================
+class value description reference
+===== ===== =============================== ===================================
+LD 0x0 non-standard load operations `Load and store instructions`_
+LDX 0x1 load into register operations `Load and store instructions`_
+ST 0x2 store from immediate operations `Load and store instructions`_
+STX 0x3 store from register operations `Load and store instructions`_
+ALU 0x4 32-bit arithmetic operations `Arithmetic and jump instructions`_
+JMP 0x5 64-bit jump operations `Arithmetic and jump instructions`_
+JMP32 0x6 32-bit jump operations `Arithmetic and jump instructions`_
+ALU64 0x7 64-bit arithmetic operations `Arithmetic and jump instructions`_
+===== ===== =============================== ===================================
Arithmetic and jump instructions
================================
-For arithmetic and jump instructions (``BPF_ALU``, ``BPF_ALU64``, ``BPF_JMP`` and
-``BPF_JMP32``), the 8-bit 'opcode' field is divided into three parts:
+For arithmetic and jump instructions (``ALU``, ``ALU64``, ``JMP`` and
+``JMP32``), the 8-bit 'opcode' field is divided into three parts::
-============== ====== =================
-4 bits (MSB) 1 bit 3 bits (LSB)
-============== ====== =================
-code source instruction class
-============== ====== =================
+ +-+-+-+-+-+-+-+-+
+ | code |s|class|
+ +-+-+-+-+-+-+-+-+
**code**
the operation code, whose meaning varies by instruction class
-**source**
+**s (source)**
the source operand location, which unless otherwise specified is one of:
====== ===== ==============================================
source value description
====== ===== ==============================================
- BPF_K 0x00 use 32-bit 'imm' value as source operand
- BPF_X 0x08 use 'src_reg' register value as source operand
+ K 0 use 32-bit 'imm' value as source operand
+ X 1 use 'src_reg' register value as source operand
====== ===== ==============================================
**instruction class**
@@ -216,70 +289,75 @@
Arithmetic instructions
-----------------------
-``BPF_ALU`` uses 32-bit wide operands while ``BPF_ALU64`` uses 64-bit wide operands for
-otherwise identical operations.
+``ALU`` uses 32-bit wide operands while ``ALU64`` uses 64-bit wide operands for
+otherwise identical operations. ``ALU64`` instructions belong to the
+base64 conformance group unless noted otherwise.
The 'code' field encodes the operation as below, where 'src' and 'dst' refer
to the values of the source and destination registers, respectively.
-========= ===== ======= ==========================================================
-code value offset description
-========= ===== ======= ==========================================================
-BPF_ADD 0x00 0 dst += src
-BPF_SUB 0x10 0 dst -= src
-BPF_MUL 0x20 0 dst \*= src
-BPF_DIV 0x30 0 dst = (src != 0) ? (dst / src) : 0
-BPF_SDIV 0x30 1 dst = (src != 0) ? (dst s/ src) : 0
-BPF_OR 0x40 0 dst \|= src
-BPF_AND 0x50 0 dst &= src
-BPF_LSH 0x60 0 dst <<= (src & mask)
-BPF_RSH 0x70 0 dst >>= (src & mask)
-BPF_NEG 0x80 0 dst = -dst
-BPF_MOD 0x90 0 dst = (src != 0) ? (dst % src) : dst
-BPF_SMOD 0x90 1 dst = (src != 0) ? (dst s% src) : dst
-BPF_XOR 0xa0 0 dst ^= src
-BPF_MOV 0xb0 0 dst = src
-BPF_MOVSX 0xb0 8/16/32 dst = (s8,s16,s32)src
-BPF_ARSH 0xc0 0 :term:`sign extending<Sign Extend>` dst >>= (src & mask)
-BPF_END 0xd0 0 byte swap operations (see `Byte swap instructions`_ below)
-========= ===== ======= ==========================================================
+===== ===== ======= ==========================================================
+name code offset description
+===== ===== ======= ==========================================================
+ADD 0x0 0 dst += src
+SUB 0x1 0 dst -= src
+MUL 0x2 0 dst \*= src
+DIV 0x3 0 dst = (src != 0) ? (dst / src) : 0
+SDIV 0x3 1 dst = (src != 0) ? (dst s/ src) : 0
+OR 0x4 0 dst \|= src
+AND 0x5 0 dst &= src
+LSH 0x6 0 dst <<= (src & mask)
+RSH 0x7 0 dst >>= (src & mask)
+NEG 0x8 0 dst = -dst
+MOD 0x9 0 dst = (src != 0) ? (dst % src) : dst
+SMOD 0x9 1 dst = (src != 0) ? (dst s% src) : dst
+XOR 0xa 0 dst ^= src
+MOV 0xb 0 dst = src
+MOVSX 0xb 8/16/32 dst = (s8,s16,s32)src
+ARSH 0xc 0 :term:`sign extending<Sign Extend>` dst >>= (src & mask)
+END 0xd 0 byte swap operations (see `Byte swap instructions`_ below)
+===== ===== ======= ==========================================================
Underflow and overflow are allowed during arithmetic operations, meaning
the 64-bit or 32-bit value will wrap. If BPF program execution would
result in division by zero, the destination register is instead set to zero.
-If execution would result in modulo by zero, for ``BPF_ALU64`` the value of
-the destination register is unchanged whereas for ``BPF_ALU`` the upper
+If execution would result in modulo by zero, for ``ALU64`` the value of
+the destination register is unchanged whereas for ``ALU`` the upper
32 bits of the destination register are zeroed.
-``BPF_ADD | BPF_X | BPF_ALU`` means::
+``{ADD, X, ALU}``, where 'code' = ``ADD``, 'source' = ``X``, and 'class' = ``ALU``, means::
dst = (u32) ((u32) dst + (u32) src)
where '(u32)' indicates that the upper 32 bits are zeroed.
-``BPF_ADD | BPF_X | BPF_ALU64`` means::
+``{ADD, X, ALU64}`` means::
dst = dst + src
-``BPF_XOR | BPF_K | BPF_ALU`` means::
+``{XOR, K, ALU}`` means::
- dst = (u32) dst ^ (u32) imm32
+ dst = (u32) dst ^ (u32) imm
-``BPF_XOR | BPF_K | BPF_ALU64`` means::
+``{XOR, K, ALU64}`` means::
- dst = dst ^ imm32
+ dst = dst ^ imm
Note that most instructions have instruction offset of 0. Only three instructions
-(``BPF_SDIV``, ``BPF_SMOD``, ``BPF_MOVSX``) have a non-zero offset.
+(``SDIV``, ``SMOD``, ``MOVSX``) have a non-zero offset.
+Division, multiplication, and modulo operations for ``ALU`` are part
+of the "divmul32" conformance group, and division, multiplication, and
+modulo operations for ``ALU64`` are part of the "divmul64" conformance
+group.
The division and modulo operations support both unsigned and signed flavors.
-For unsigned operations (``BPF_DIV`` and ``BPF_MOD``), for ``BPF_ALU``,
-'imm' is interpreted as a 32-bit unsigned value. For ``BPF_ALU64``,
+For unsigned operations (``DIV`` and ``MOD``), for ``ALU``,
+'imm' is interpreted as a 32-bit unsigned value. For ``ALU64``,
'imm' is first :term:`sign extended<Sign Extend>` from 32 to 64 bits, and then
interpreted as a 64-bit unsigned value.
-For signed operations (``BPF_SDIV`` and ``BPF_SMOD``), for ``BPF_ALU``,
-'imm' is interpreted as a 32-bit signed value. For ``BPF_ALU64``, 'imm'
+For signed operations (``SDIV`` and ``SMOD``), for ``ALU``,
+'imm' is interpreted as a 32-bit signed value. For ``ALU64``, 'imm'
is first :term:`sign extended<Sign Extend>` from 32 to 64 bits, and then
interpreted as a 64-bit signed value.
@@ -291,11 +369,15 @@
a % n = a - n * trunc(a / n)
-The ``BPF_MOVSX`` instruction does a move operation with sign extension.
-``BPF_ALU | BPF_MOVSX`` :term:`sign extends<Sign Extend>` 8-bit and 16-bit operands into 32
+The ``MOVSX`` instruction does a move operation with sign extension.
+``{MOVSX, X, ALU}`` :term:`sign extends<Sign Extend>` 8-bit and 16-bit operands into 32
bit operands, and zeroes the remaining upper 32 bits.
-``BPF_ALU64 | BPF_MOVSX`` :term:`sign extends<Sign Extend>` 8-bit, 16-bit, and 32-bit
-operands into 64 bit operands.
+``{MOVSX, X, ALU64}`` :term:`sign extends<Sign Extend>` 8-bit, 16-bit, and 32-bit
+operands into 64 bit operands. Unlike other arithmetic instructions,
+``MOVSX`` is only defined for register source operands (``X``).
+
+The ``NEG`` instruction is only defined when the source bit is clear
+(``K``).
Shift operations use a mask of 0x3F (63) for 64-bit operations and 0x1F (31)
for 32-bit operations.
@@ -303,43 +385,45 @@
Byte swap instructions
----------------------
-The byte swap instructions use instruction classes of ``BPF_ALU`` and ``BPF_ALU64``
-and a 4-bit 'code' field of ``BPF_END``.
+The byte swap instructions use instruction classes of ``ALU`` and ``ALU64``
+and a 4-bit 'code' field of ``END``.
The byte swap instructions operate on the destination register
only and do not use a separate source register or immediate value.
-For ``BPF_ALU``, the 1-bit source operand field in the opcode is used to
+For ``ALU``, the 1-bit source operand field in the opcode is used to
select what byte order the operation converts from or to. For
-``BPF_ALU64``, the 1-bit source operand field in the opcode is reserved
+``ALU64``, the 1-bit source operand field in the opcode is reserved
and must be set to 0.
-========= ========= ===== =================================================
-class source value description
-========= ========= ===== =================================================
-BPF_ALU BPF_TO_LE 0x00 convert between host byte order and little endian
-BPF_ALU BPF_TO_BE 0x08 convert between host byte order and big endian
-BPF_ALU64 Reserved 0x00 do byte swap unconditionally
-========= ========= ===== =================================================
+===== ======== ===== =================================================
+class source value description
+===== ======== ===== =================================================
+ALU TO_LE 0 convert between host byte order and little endian
+ALU TO_BE 1 convert between host byte order and big endian
+ALU64 Reserved 0 do byte swap unconditionally
+===== ======== ===== =================================================
The 'imm' field encodes the width of the swap operations. The following widths
-are supported: 16, 32 and 64.
+are supported: 16, 32 and 64. Width 64 operations belong to the base64
+conformance group and other swap operations belong to the base32
+conformance group.
Examples:
-``BPF_ALU | BPF_TO_LE | BPF_END`` with imm = 16/32/64 means::
+``{END, TO_LE, ALU}`` with imm = 16/32/64 means::
dst = htole16(dst)
dst = htole32(dst)
dst = htole64(dst)
-``BPF_ALU | BPF_TO_BE | BPF_END`` with imm = 16/32/64 means::
+``{END, TO_BE, ALU}`` with imm = 16/32/64 means::
dst = htobe16(dst)
dst = htobe32(dst)
dst = htobe64(dst)
-``BPF_ALU64 | BPF_TO_LE | BPF_END`` with imm = 16/32/64 means::
+``{END, TO_LE, ALU64}`` with imm = 16/32/64 means::
dst = bswap16(dst)
dst = bswap32(dst)
@@ -348,56 +432,61 @@
Jump instructions
-----------------
-``BPF_JMP32`` uses 32-bit wide operands while ``BPF_JMP`` uses 64-bit wide operands for
-otherwise identical operations.
+``JMP32`` uses 32-bit wide operands and indicates the base32
+conformance group, while ``JMP`` uses 64-bit wide operands for
+otherwise identical operations, and indicates the base64 conformance
+group unless otherwise specified.
The 'code' field encodes the operation as below:
-======== ===== === =========================================== =========================================
-code value src description notes
-======== ===== === =========================================== =========================================
-BPF_JA 0x0 0x0 PC += offset BPF_JMP class
-BPF_JA 0x0 0x0 PC += imm BPF_JMP32 class
-BPF_JEQ 0x1 any PC += offset if dst == src
-BPF_JGT 0x2 any PC += offset if dst > src unsigned
-BPF_JGE 0x3 any PC += offset if dst >= src unsigned
-BPF_JSET 0x4 any PC += offset if dst & src
-BPF_JNE 0x5 any PC += offset if dst != src
-BPF_JSGT 0x6 any PC += offset if dst > src signed
-BPF_JSGE 0x7 any PC += offset if dst >= src signed
-BPF_CALL 0x8 0x0 call helper function by address see `Helper functions`_
-BPF_CALL 0x8 0x1 call PC += imm see `Program-local functions`_
-BPF_CALL 0x8 0x2 call helper function by BTF ID see `Helper functions`_
-BPF_EXIT 0x9 0x0 return BPF_JMP only
-BPF_JLT 0xa any PC += offset if dst < src unsigned
-BPF_JLE 0xb any PC += offset if dst <= src unsigned
-BPF_JSLT 0xc any PC += offset if dst < src signed
-BPF_JSLE 0xd any PC += offset if dst <= src signed
-======== ===== === =========================================== =========================================
+======== ===== ======= =============================== ===================================================
+code value src_reg description notes
+======== ===== ======= =============================== ===================================================
+JA 0x0 0x0 PC += offset {JA, K, JMP} only
+JA 0x0 0x0 PC += imm {JA, K, JMP32} only
+JEQ 0x1 any PC += offset if dst == src
+JGT 0x2 any PC += offset if dst > src unsigned
+JGE 0x3 any PC += offset if dst >= src unsigned
+JSET 0x4 any PC += offset if dst & src
+JNE 0x5 any PC += offset if dst != src
+JSGT 0x6 any PC += offset if dst > src signed
+JSGE 0x7 any PC += offset if dst >= src signed
+CALL 0x8 0x0 call helper function by address {CALL, K, JMP} only, see `Helper functions`_
+CALL 0x8 0x1 call PC += imm {CALL, K, JMP} only, see `Program-local functions`_
+CALL 0x8 0x2 call helper function by BTF ID {CALL, K, JMP} only, see `Helper functions`_
+EXIT 0x9 0x0 return {CALL, K, JMP} only
+JLT 0xa any PC += offset if dst < src unsigned
+JLE 0xb any PC += offset if dst <= src unsigned
+JSLT 0xc any PC += offset if dst < src signed
+JSLE 0xd any PC += offset if dst <= src signed
+======== ===== ======= =============================== ===================================================
-The BPF program needs to store the return value into register R0 before doing a
-``BPF_EXIT``.
+The BPF program needs to store the return value into register R0 before doing an
+``EXIT``.
Example:
-``BPF_JSGE | BPF_X | BPF_JMP32`` (0x7e) means::
+``{JSGE, X, JMP32}`` means::
if (s32)dst s>= (s32)src goto +offset
where 's>=' indicates a signed '>=' comparison.
-``BPF_JA | BPF_K | BPF_JMP32`` (0x06) means::
+``{JA, K, JMP32}`` means::
gotol +imm
where 'imm' means the branch offset comes from insn 'imm' field.
-Note that there are two flavors of ``BPF_JA`` instructions. The
-``BPF_JMP`` class permits a 16-bit jump offset specified by the 'offset'
-field, whereas the ``BPF_JMP32`` class permits a 32-bit jump offset
+Note that there are two flavors of ``JA`` instructions. The
+``JMP`` class permits a 16-bit jump offset specified by the 'offset'
+field, whereas the ``JMP32`` class permits a 32-bit jump offset
specified by the 'imm' field. A > 16-bit conditional jump may be
converted to a < 16-bit conditional jump plus a 32-bit unconditional
jump.
+All ``CALL`` and ``JA`` instructions belong to the
+base32 conformance group.
+
Helper functions
~~~~~~~~~~~~~~~~
@@ -416,78 +505,83 @@
~~~~~~~~~~~~~~~~~~~~~~~
Program-local functions are functions exposed by the same BPF program as the
caller, and are referenced by offset from the call instruction, similar to
-``BPF_JA``. The offset is encoded in the imm field of the call instruction.
-A ``BPF_EXIT`` within the program-local function will return to the caller.
+``JA``. The offset is encoded in the imm field of the call instruction.
+A ``EXIT`` within the program-local function will return to the caller.
Load and store instructions
===========================
-For load and store instructions (``BPF_LD``, ``BPF_LDX``, ``BPF_ST``, and ``BPF_STX``), the
-8-bit 'opcode' field is divided as:
+For load and store instructions (``LD``, ``LDX``, ``ST``, and ``STX``), the
+8-bit 'opcode' field is divided as::
-============ ====== =================
-3 bits (MSB) 2 bits 3 bits (LSB)
-============ ====== =================
-mode size instruction class
-============ ====== =================
+ +-+-+-+-+-+-+-+-+
+ |mode |sz |class|
+ +-+-+-+-+-+-+-+-+
-The mode modifier is one of:
+**mode**
+ The mode modifier is one of:
- ============= ===== ==================================== =============
- mode modifier value description reference
- ============= ===== ==================================== =============
- BPF_IMM 0x00 64-bit immediate instructions `64-bit immediate instructions`_
- BPF_ABS 0x20 legacy BPF packet access (absolute) `Legacy BPF Packet access instructions`_
- BPF_IND 0x40 legacy BPF packet access (indirect) `Legacy BPF Packet access instructions`_
- BPF_MEM 0x60 regular load and store operations `Regular load and store operations`_
- BPF_MEMSX 0x80 sign-extension load operations `Sign-extension load operations`_
- BPF_ATOMIC 0xc0 atomic operations `Atomic operations`_
- ============= ===== ==================================== =============
+ ============= ===== ==================================== =============
+ mode modifier value description reference
+ ============= ===== ==================================== =============
+ IMM 0 64-bit immediate instructions `64-bit immediate instructions`_
+ ABS 1 legacy BPF packet access (absolute) `Legacy BPF Packet access instructions`_
+ IND 2 legacy BPF packet access (indirect) `Legacy BPF Packet access instructions`_
+ MEM 3 regular load and store operations `Regular load and store operations`_
+ MEMSX 4 sign-extension load operations `Sign-extension load operations`_
+ ATOMIC 6 atomic operations `Atomic operations`_
+ ============= ===== ==================================== =============
-The size modifier is one of:
+**sz (size)**
+ The size modifier is one of:
- ============= ===== =====================
- size modifier value description
- ============= ===== =====================
- BPF_W 0x00 word (4 bytes)
- BPF_H 0x08 half word (2 bytes)
- BPF_B 0x10 byte
- BPF_DW 0x18 double word (8 bytes)
- ============= ===== =====================
+ ==== ===== =====================
+ size value description
+ ==== ===== =====================
+ W 0 word (4 bytes)
+ H 1 half word (2 bytes)
+ B 2 byte
+ DW 3 double word (8 bytes)
+ ==== ===== =====================
+
+ Instructions using ``DW`` belong to the base64 conformance group.
+
+**class**
+ The instruction class (see `Instruction classes`_)
Regular load and store operations
---------------------------------
-The ``BPF_MEM`` mode modifier is used to encode regular load and store
+The ``MEM`` mode modifier is used to encode regular load and store
instructions that transfer data between a register and memory.
-``BPF_MEM | <size> | BPF_STX`` means::
+``{MEM, <size>, STX}`` means::
*(size *) (dst + offset) = src
-``BPF_MEM | <size> | BPF_ST`` means::
+``{MEM, <size>, ST}`` means::
- *(size *) (dst + offset) = imm32
+ *(size *) (dst + offset) = imm
-``BPF_MEM | <size> | BPF_LDX`` means::
+``{MEM, <size>, LDX}`` means::
dst = *(unsigned size *) (src + offset)
-Where size is one of: ``BPF_B``, ``BPF_H``, ``BPF_W``, or ``BPF_DW`` and
-'unsigned size' is one of u8, u16, u32 or u64.
+Where '<size>' is one of: ``B``, ``H``, ``W``, or ``DW``, and
+'unsigned size' is one of: u8, u16, u32, or u64.
Sign-extension load operations
------------------------------
-The ``BPF_MEMSX`` mode modifier is used to encode :term:`sign-extension<Sign Extend>` load
+The ``MEMSX`` mode modifier is used to encode :term:`sign-extension<Sign Extend>` load
instructions that transfer data between a register and memory.
-``BPF_MEMSX | <size> | BPF_LDX`` means::
+``{MEMSX, <size>, LDX}`` means::
dst = *(signed size *) (src + offset)
-Where size is one of: ``BPF_B``, ``BPF_H`` or ``BPF_W``, and
-'signed size' is one of s8, s16 or s32.
+Where size is one of: ``B``, ``H``, or ``W``, and
+'signed size' is one of: s8, s16, or s32.
Atomic operations
-----------------
@@ -497,10 +591,12 @@
by other BPF programs or means outside of this specification.
All atomic operations supported by BPF are encoded as store operations
-that use the ``BPF_ATOMIC`` mode modifier as follows:
+that use the ``ATOMIC`` mode modifier as follows:
-* ``BPF_ATOMIC | BPF_W | BPF_STX`` for 32-bit operations
-* ``BPF_ATOMIC | BPF_DW | BPF_STX`` for 64-bit operations
+* ``{ATOMIC, W, STX}`` for 32-bit operations, which are
+ part of the "atomic32" conformance group.
+* ``{ATOMIC, DW, STX}`` for 64-bit operations, which are
+ part of the "atomic64" conformance group.
* 8-bit and 16-bit wide atomic operations are not supported.
The 'imm' field is used to encode the actual atomic operation.
@@ -510,18 +606,18 @@
======== ===== ===========
imm value description
======== ===== ===========
-BPF_ADD 0x00 atomic add
-BPF_OR 0x40 atomic or
-BPF_AND 0x50 atomic and
-BPF_XOR 0xa0 atomic xor
+ADD 0x00 atomic add
+OR 0x40 atomic or
+AND 0x50 atomic and
+XOR 0xa0 atomic xor
======== ===== ===========
-``BPF_ATOMIC | BPF_W | BPF_STX`` with 'imm' = BPF_ADD means::
+``{ATOMIC, W, STX}`` with 'imm' = ADD means::
*(u32 *)(dst + offset) += src
-``BPF_ATOMIC | BPF_DW | BPF_STX`` with 'imm' = BPF ADD means::
+``{ATOMIC, DW, STX}`` with 'imm' = ADD means::
*(u64 *)(dst + offset) += src
@@ -531,20 +627,20 @@
=========== ================ ===========================
imm value description
=========== ================ ===========================
-BPF_FETCH 0x01 modifier: return old value
-BPF_XCHG 0xe0 | BPF_FETCH atomic exchange
-BPF_CMPXCHG 0xf0 | BPF_FETCH atomic compare and exchange
+FETCH 0x01 modifier: return old value
+XCHG 0xe0 | FETCH atomic exchange
+CMPXCHG 0xf0 | FETCH atomic compare and exchange
=========== ================ ===========================
-The ``BPF_FETCH`` modifier is optional for simple atomic operations, and
-always set for the complex atomic operations. If the ``BPF_FETCH`` flag
+The ``FETCH`` modifier is optional for simple atomic operations, and
+always set for the complex atomic operations. If the ``FETCH`` flag
is set, then the operation also overwrites ``src`` with the value that
was in memory before it was modified.
-The ``BPF_XCHG`` operation atomically exchanges ``src`` with the value
+The ``XCHG`` operation atomically exchanges ``src`` with the value
addressed by ``dst + offset``.
-The ``BPF_CMPXCHG`` operation atomically compares the value addressed by
+The ``CMPXCHG`` operation atomically compares the value addressed by
``dst + offset`` with ``R0``. If they match, the value addressed by
``dst + offset`` is replaced with ``src``. In either case, the
value that was at ``dst + offset`` before the operation is zero-extended
@@ -553,25 +649,25 @@
64-bit immediate instructions
-----------------------------
-Instructions with the ``BPF_IMM`` 'mode' modifier use the wide instruction
-encoding defined in `Instruction encoding`_, and use the 'src' field of the
+Instructions with the ``IMM`` 'mode' modifier use the wide instruction
+encoding defined in `Instruction encoding`_, and use the 'src_reg' field of the
basic instruction to hold an opcode subtype.
-The following table defines a set of ``BPF_IMM | BPF_DW | BPF_LD`` instructions
-with opcode subtypes in the 'src' field, using new terms such as "map"
+The following table defines a set of ``{IMM, DW, LD}`` instructions
+with opcode subtypes in the 'src_reg' field, using new terms such as "map"
defined further below:
-========================= ====== === ========================================= =========== ==============
-opcode construction opcode src pseudocode imm type dst type
-========================= ====== === ========================================= =========== ==============
-BPF_IMM | BPF_DW | BPF_LD 0x18 0x0 dst = imm64 integer integer
-BPF_IMM | BPF_DW | BPF_LD 0x18 0x1 dst = map_by_fd(imm) map fd map
-BPF_IMM | BPF_DW | BPF_LD 0x18 0x2 dst = map_val(map_by_fd(imm)) + next_imm map fd data pointer
-BPF_IMM | BPF_DW | BPF_LD 0x18 0x3 dst = var_addr(imm) variable id data pointer
-BPF_IMM | BPF_DW | BPF_LD 0x18 0x4 dst = code_addr(imm) integer code pointer
-BPF_IMM | BPF_DW | BPF_LD 0x18 0x5 dst = map_by_idx(imm) map index map
-BPF_IMM | BPF_DW | BPF_LD 0x18 0x6 dst = map_val(map_by_idx(imm)) + next_imm map index data pointer
-========================= ====== === ========================================= =========== ==============
+======= ========================================= =========== ==============
+src_reg pseudocode imm type dst type
+======= ========================================= =========== ==============
+0x0 dst = (next_imm << 32) | imm integer integer
+0x1 dst = map_by_fd(imm) map fd map
+0x2 dst = map_val(map_by_fd(imm)) + next_imm map fd data pointer
+0x3 dst = var_addr(imm) variable id data pointer
+0x4 dst = code_addr(imm) integer code pointer
+0x5 dst = map_by_idx(imm) map index map
+0x6 dst = map_val(map_by_idx(imm)) + next_imm map index data pointer
+======= ========================================= =========== ==============
where
@@ -609,5 +705,9 @@
-------------------------------------
BPF previously introduced special instructions for access to packet data that were
-carried over from classic BPF. However, these instructions are
-deprecated and should no longer be used.
+carried over from classic BPF. These instructions used an instruction
+class of ``LD``, a size modifier of ``W``, ``H``, or ``B``, and a
+mode modifier of ``ABS`` or ``IND``. The 'dst_reg' and 'offset' fields were
+set to zero, and 'src_reg' was set to zero for ``ABS``. However, these
+instructions are deprecated and should no longer be used. All legacy packet
+access instructions belong to the "packet" conformance group.
diff --git a/Documentation/bpf/verifier.rst b/Documentation/bpf/verifier.rst
index f0ec19d..3568943 100644
--- a/Documentation/bpf/verifier.rst
+++ b/Documentation/bpf/verifier.rst
@@ -562,7 +562,7 @@
* ``checkpoint[0].r1`` is marked as read;
* At instruction #5 exit is reached and ``checkpoint[0]`` can now be processed
- by ``clean_live_states()``. After this processing ``checkpoint[0].r0`` has a
+ by ``clean_live_states()``. After this processing ``checkpoint[0].r1`` has a
read mark and all other registers and stack slots are marked as ``NOT_INIT``
or ``STACK_INVALID``
diff --git a/Documentation/conf.py b/Documentation/conf.py
index 5830b01..d148f3e 100644
--- a/Documentation/conf.py
+++ b/Documentation/conf.py
@@ -346,9 +346,9 @@
html_static_path = ['sphinx-static']
# If true, Docutils "smart quotes" will be used to convert quotes and dashes
-# to typographically correct entities. This will convert "--" to "—",
-# which is not always what we want, so disable it.
-smartquotes = False
+# to typographically correct entities. However, conversion of "--" to "—"
+# is not always what we want, so enable only quotes.
+smartquotes_action = 'q'
# Custom sidebar templates, maps document names to template names.
# Note that the RTD theme ignores this
@@ -388,6 +388,12 @@
verbatimhintsturnover=false,
''',
+ #
+ # Some of our authors are fond of deep nesting; tell latex to
+ # cope.
+ #
+ 'maxlistdepth': '10',
+
# For CJK One-half spacing, need to be in front of hyperref
'extrapackages': r'\usepackage{setspace}',
diff --git a/Documentation/core-api/workqueue.rst b/Documentation/core-api/workqueue.rst
index 3599cf9..ed73c61 100644
--- a/Documentation/core-api/workqueue.rst
+++ b/Documentation/core-api/workqueue.rst
@@ -77,10 +77,12 @@
item pointing to that function and queue that work item on a
workqueue.
-Special purpose threads, called worker threads, execute the functions
-off of the queue, one after the other. If no work is queued, the
-worker threads become idle. These worker threads are managed in so
-called worker-pools.
+A work item can be executed in either a thread or the BH (softirq) context.
+
+For threaded workqueues, special purpose threads, called [k]workers, execute
+the functions off of the queue, one after the other. If no work is queued,
+the worker threads become idle. These worker threads are managed in
+worker-pools.
The cmwq design differentiates between the user-facing workqueues that
subsystems and drivers queue work items on and the backend mechanism
@@ -91,6 +93,12 @@
worker-pools to serve work items queued on unbound workqueues - the
number of these backing pools is dynamic.
+BH workqueues use the same framework. However, as there can only be one
+concurrent execution context, there's no need to worry about concurrency.
+Each per-CPU BH worker pool contains only one pseudo worker which represents
+the BH execution context. A BH workqueue can be considered a convenience
+interface to softirq.
+
Subsystems and drivers can create and queue work items through special
workqueue API functions as they see fit. They can influence some
aspects of the way the work items are executed by setting flags on the
@@ -106,7 +114,7 @@
be queued on the worklist of either normal or highpri worker-pool that
is associated to the CPU the issuer is running on.
-For any worker pool implementation, managing the concurrency level
+For any thread pool implementation, managing the concurrency level
(how many execution contexts are active) is an important issue. cmwq
tries to keep the concurrency at a minimal but sufficient level.
Minimal to save resources and sufficient in that the system is used at
@@ -164,6 +172,17 @@
``flags``
---------
+``WQ_BH``
+ BH workqueues can be considered a convenience interface to softirq. BH
+ workqueues are always per-CPU and all BH work items are executed in the
+ queueing CPU's softirq context in the queueing order.
+
+ All BH workqueues must have 0 ``max_active`` and ``WQ_HIGHPRI`` is the
+ only allowed additional flag.
+
+ BH work items cannot sleep. All other features such as delayed queueing,
+ flushing and canceling are supported.
+
``WQ_UNBOUND``
Work items queued to an unbound wq are served by the special
worker-pools which host workers which are not bound to any
@@ -237,15 +256,11 @@
throttling the number of active work items, specifying '0' is
recommended.
-Some users depend on the strict execution ordering of ST wq. The
-combination of ``@max_active`` of 1 and ``WQ_UNBOUND`` used to
-achieve this behavior. Work items on such wq were always queued to the
-unbound worker-pools and only one work item could be active at any given
-time thus achieving the same ordering property as ST wq.
-
-In the current implementation the above configuration only guarantees
-ST behavior within a given NUMA node. Instead ``alloc_ordered_workqueue()`` should
-be used to achieve system-wide ST behavior.
+Some users depend on strict execution ordering where only one work item
+is in flight at any given time and the work items are processed in
+queueing order. While the combination of ``@max_active`` of 1 and
+``WQ_UNBOUND`` used to achieve this behavior, this is no longer the
+case. Use ``alloc_ordered_queue()`` instead.
Example Execution Scenarios
diff --git a/Documentation/dev-tools/checkpatch.rst b/Documentation/dev-tools/checkpatch.rst
index c3389c6..1279689 100644
--- a/Documentation/dev-tools/checkpatch.rst
+++ b/Documentation/dev-tools/checkpatch.rst
@@ -168,7 +168,7 @@
- --fix
- This is an EXPERIMENTAL feature. If correctable errors exists, a file
+ This is an EXPERIMENTAL feature. If correctable errors exist, a file
<inputfile>.EXPERIMENTAL-checkpatch-fixes is created which has the
automatically fixable errors corrected.
@@ -181,7 +181,7 @@
- --ignore-perl-version
- Override checking of perl version. Runtime errors maybe encountered after
+ Override checking of perl version. Runtime errors may be encountered after
enabling this flag if the perl version does not meet the minimum specified.
- --codespell
diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst
index 858c77f..d7de44f 100644
--- a/Documentation/dev-tools/kasan.rst
+++ b/Documentation/dev-tools/kasan.rst
@@ -169,7 +169,7 @@
A typical KASAN report looks like this::
==================================================================
- BUG: KASAN: slab-out-of-bounds in kmalloc_oob_right+0xa8/0xbc [test_kasan]
+ BUG: KASAN: slab-out-of-bounds in kmalloc_oob_right+0xa8/0xbc [kasan_test]
Write of size 1 at addr ffff8801f44ec37b by task insmod/2760
CPU: 1 PID: 2760 Comm: insmod Not tainted 4.19.0-rc3+ #698
@@ -179,8 +179,8 @@
print_address_description+0x73/0x280
kasan_report+0x144/0x187
__asan_report_store1_noabort+0x17/0x20
- kmalloc_oob_right+0xa8/0xbc [test_kasan]
- kmalloc_tests_init+0x16/0x700 [test_kasan]
+ kmalloc_oob_right+0xa8/0xbc [kasan_test]
+ kmalloc_tests_init+0x16/0x700 [kasan_test]
do_one_initcall+0xa5/0x3ae
do_init_module+0x1b6/0x547
load_module+0x75df/0x8070
@@ -200,8 +200,8 @@
save_stack+0x43/0xd0
kasan_kmalloc+0xa7/0xd0
kmem_cache_alloc_trace+0xe1/0x1b0
- kmalloc_oob_right+0x56/0xbc [test_kasan]
- kmalloc_tests_init+0x16/0x700 [test_kasan]
+ kmalloc_oob_right+0x56/0xbc [kasan_test]
+ kmalloc_tests_init+0x16/0x700 [kasan_test]
do_one_initcall+0xa5/0x3ae
do_init_module+0x1b6/0x547
load_module+0x75df/0x8070
@@ -277,6 +277,27 @@
directly present in the bad access stack trace. Currently, this includes
call_rcu() and workqueue queuing.
+CONFIG_KASAN_EXTRA_INFO
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Enabling CONFIG_KASAN_EXTRA_INFO allows KASAN to record and report more
+information. The extra information currently supported is the CPU number and
+timestamp at allocation and free. More information can help find the cause of
+the bug and correlate the error with other system events, at the cost of using
+extra memory to record more information (more cost details in the help text of
+CONFIG_KASAN_EXTRA_INFO).
+
+Here is the report with CONFIG_KASAN_EXTRA_INFO enabled (only the
+different parts are shown)::
+
+ ==================================================================
+ ...
+ Allocated by task 134 on cpu 5 at 229.133855s:
+ ...
+ Freed by task 136 on cpu 3 at 230.199335s:
+ ...
+ ==================================================================
+
Implementation details
----------------------
@@ -510,15 +531,15 @@
When a test fails due to a failed ``kmalloc``::
- # kmalloc_large_oob_right: ASSERTION FAILED at lib/test_kasan.c:163
+ # kmalloc_large_oob_right: ASSERTION FAILED at mm/kasan/kasan_test.c:245
Expected ptr is not null, but is
- not ok 4 - kmalloc_large_oob_right
+ not ok 5 - kmalloc_large_oob_right
When a test fails due to a missing KASAN report::
- # kmalloc_double_kzfree: EXPECTATION FAILED at lib/test_kasan.c:974
+ # kmalloc_double_kzfree: EXPECTATION FAILED at mm/kasan/kasan_test.c:709
KASAN failure expected in "kfree_sensitive(ptr)", but none occurred
- not ok 44 - kmalloc_double_kzfree
+ not ok 28 - kmalloc_double_kzfree
At the end the cumulative status of all KASAN tests is printed. On success::
@@ -534,7 +555,7 @@
1. Loadable module
With ``CONFIG_KUNIT`` enabled, KASAN-KUnit tests can be built as a loadable
- module and run by loading ``test_kasan.ko`` with ``insmod`` or ``modprobe``.
+ module and run by loading ``kasan_test.ko`` with ``insmod`` or ``modprobe``.
2. Built-In
diff --git a/Documentation/dev-tools/kselftest.rst b/Documentation/dev-tools/kselftest.rst
index ab376b31..ff10dc6e 100644
--- a/Documentation/dev-tools/kselftest.rst
+++ b/Documentation/dev-tools/kselftest.rst
@@ -245,6 +245,10 @@
TEST_PROGS, TEST_GEN_PROGS mean it is the executable tested by
default.
+ TEST_GEN_MODS_DIR should be used by tests that require modules to be built
+ before the test starts. The variable will contain the name of the directory
+ containing the modules.
+
TEST_CUSTOM_PROGS should be used by tests that require custom build
rules and prevent common build rule use.
@@ -255,9 +259,21 @@
TEST_PROGS_EXTENDED, TEST_GEN_PROGS_EXTENDED mean it is the
executable which is not tested by default.
+
TEST_FILES, TEST_GEN_FILES mean it is the file which is used by
test.
+ TEST_INCLUDES is similar to TEST_FILES, it lists files which should be
+ included when exporting or installing the tests, with the following
+ differences:
+
+ * symlinks to files in other directories are preserved
+ * the part of paths below tools/testing/selftests/ is preserved when
+ copying the files to the output directory
+
+ TEST_INCLUDES is meant to list dependencies located in other directories of
+ the selftests hierarchy.
+
* First use the headers inside the kernel source and/or git repo, and then the
system headers. Headers for the kernel release as opposed to headers
installed by the distro on the system should be the primary focus to be able
diff --git a/Documentation/dev-tools/testing-overview.rst b/Documentation/dev-tools/testing-overview.rst
index 0aaf6ea..1619e5e 100644
--- a/Documentation/dev-tools/testing-overview.rst
+++ b/Documentation/dev-tools/testing-overview.rst
@@ -104,6 +104,8 @@
KASAN and can be used in production. See Documentation/dev-tools/kfence.rst
* lockdep is a locking correctness validator. See
Documentation/locking/lockdep-design.rst
+* Runtime Verification (RV) supports checking specific behaviours for a given
+ subsystem. See Documentation/trace/rv/runtime-verification.rst
* There are several other pieces of debug instrumentation in the kernel, many
of which can be found in lib/Kconfig.debug
diff --git a/Documentation/dev-tools/ubsan.rst b/Documentation/dev-tools/ubsan.rst
index 2de7c63..e3591f8 100644
--- a/Documentation/dev-tools/ubsan.rst
+++ b/Documentation/dev-tools/ubsan.rst
@@ -49,34 +49,22 @@
Usage
-----
-To enable UBSAN configure kernel with::
+To enable UBSAN, configure the kernel with::
- CONFIG_UBSAN=y
+ CONFIG_UBSAN=y
-and to check the entire kernel::
-
- CONFIG_UBSAN_SANITIZE_ALL=y
-
-To enable instrumentation for specific files or directories, add a line
-similar to the following to the respective kernel Makefile:
-
-- For a single file (e.g. main.o)::
-
- UBSAN_SANITIZE_main.o := y
-
-- For all files in one directory::
-
- UBSAN_SANITIZE := y
-
-To exclude files from being instrumented even if
-``CONFIG_UBSAN_SANITIZE_ALL=y``, use::
+To exclude files from being instrumented use::
UBSAN_SANITIZE_main.o := n
-and::
+and to exclude all targets in one directory use::
UBSAN_SANITIZE := n
+When disabled for all targets, specific files can be enabled using::
+
+ UBSAN_SANITIZE_main.o := y
+
Detection of unaligned accesses controlled through the separate option -
CONFIG_UBSAN_ALIGNMENT. It's off by default on architectures that support
unaligned accesses (CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y). One could
diff --git a/Documentation/devicetree/bindings/Makefile b/Documentation/devicetree/bindings/Makefile
index 2323fd5..5e08e3a 100644
--- a/Documentation/devicetree/bindings/Makefile
+++ b/Documentation/devicetree/bindings/Makefile
@@ -28,7 +28,10 @@
find_all_cmd = find $(srctree)/$(src) \( -name '*.yaml' ! \
-name 'processed-schema*' \)
-find_cmd = $(find_all_cmd) | sed 's|^$(srctree)/$(src)/||' | grep -F -e "$(subst :," -e ",$(DT_SCHEMA_FILES))" | sed 's|^|$(srctree)/$(src)/|'
+find_cmd = $(find_all_cmd) | \
+ sed 's|^$(srctree)/||' | \
+ grep -F -e "$(subst :," -e ",$(DT_SCHEMA_FILES))" | \
+ sed 's|^|$(srctree)/|'
CHK_DT_DOCS := $(shell $(find_cmd))
quiet_cmd_yamllint = LINT $(src)
@@ -61,9 +64,6 @@
-Wno-unique_unit_address \
-Wunique_unit_address_if_enabled
-# Disable undocumented compatible checks until warning free
-override DT_CHECKER_FLAGS ?=
-
$(obj)/processed-schema.json: $(DT_DOCS) $(src)/.yamllint check_dtschema_version FORCE
$(call if_changed_rule,chkdt)
diff --git a/Documentation/devicetree/bindings/arm/amlogic.yaml b/Documentation/devicetree/bindings/arm/amlogic.yaml
index caab7ce..949537c 100644
--- a/Documentation/devicetree/bindings/arm/amlogic.yaml
+++ b/Documentation/devicetree/bindings/arm/amlogic.yaml
@@ -7,19 +7,11 @@
title: Amlogic SoC based Platforms
maintainers:
+ - Neil Armstrong <neil.armstrong@linaro.org>
+ - Martin Blumenstingl <martin.blumenstingl@googlemail.com>
+ - Jerome Brunet <jbrunet@baylibre.com>
- Kevin Hilman <khilman@baylibre.com>
-description: |+
- Work in progress statement:
-
- Device tree files and bindings applying to Amlogic SoCs and boards are
- considered "unstable". Any Amlogic device tree binding may change at
- any time. Be sure to use a device tree binary and a kernel image
- generated from the same source tree.
-
- Please refer to Documentation/devicetree/bindings/ABI.rst for a definition of a
- stable binding/ABI.
-
properties:
$nodename:
const: '/'
@@ -146,6 +138,7 @@
- enum:
- amediatech,x96-max
- amlogic,u200
+ - freebox,fbx8am
- radxa,zero
- seirobotics,sei510
- const: amlogic,g12a
diff --git a/Documentation/devicetree/bindings/arm/arm,realview.yaml b/Documentation/devicetree/bindings/arm/arm,realview.yaml
index d1bdee9..3c5f168 100644
--- a/Documentation/devicetree/bindings/arm/arm,realview.yaml
+++ b/Documentation/devicetree/bindings/arm/arm,realview.yaml
@@ -10,9 +10,9 @@
- Linus Walleij <linus.walleij@linaro.org>
description: |+
- The ARM RealView series of reference designs were built to explore the ARM
- 11, Cortex A-8 and Cortex A-9 CPUs. This included new features compared to
- the earlier CPUs such as TrustZone and multicore (MPCore).
+ The ARM RealView series of reference designs were built to explore the Arm11,
+ Cortex-A8, and Cortex-A9 CPUs. This included new features compared to the
+ earlier CPUs such as TrustZone and multicore (MPCore).
properties:
$nodename:
diff --git a/Documentation/devicetree/bindings/arm/atmel-at91.yaml b/Documentation/devicetree/bindings/arm/atmel-at91.yaml
index 89d75fb..82f3732 100644
--- a/Documentation/devicetree/bindings/arm/atmel-at91.yaml
+++ b/Documentation/devicetree/bindings/arm/atmel-at91.yaml
@@ -179,6 +179,12 @@
- const: microchip,sama7g5
- const: microchip,sama7
+ - description: Microchip SAMA7G54 Curiosity Board
+ items:
+ - const: microchip,sama7g54-curiosity
+ - const: microchip,sama7g5
+ - const: microchip,sama7
+
- description: Microchip LAN9662 Evaluation Boards.
items:
- enum:
diff --git a/Documentation/devicetree/bindings/arm/fsl.yaml b/Documentation/devicetree/bindings/arm/fsl.yaml
index 228dcc5..0027201 100644
--- a/Documentation/devicetree/bindings/arm/fsl.yaml
+++ b/Documentation/devicetree/bindings/arm/fsl.yaml
@@ -384,7 +384,8 @@
- toradex,apalis_imx6q-ixora # Apalis iMX6Q/D Module on Ixora Carrier Board
- toradex,apalis_imx6q-ixora-v1.1 # Apalis iMX6Q/D Module on Ixora V1.1 Carrier Board
- toradex,apalis_imx6q-ixora-v1.2 # Apalis iMX6Q/D Module on Ixora V1.2 Carrier Board
- - toradex,apalis_imx6q-eval # Apalis iMX6Q/D Module on Apalis Evaluation Board
+ - toradex,apalis_imx6q-eval # Apalis iMX6Q/D Module on Apalis Evaluation Board v1.0/v1.1
+ - toradex,apalis_imx6q-eval-v1.2 # Apalis iMX6Q/D Module on Apalis Evaluation Board v1.2
- const: toradex,apalis_imx6q
- const: fsl,imx6q
@@ -469,6 +470,7 @@
- prt,prtvt7 # Protonic VT7 board
- rex,imx6dl-rex-basic # Rex Basic i.MX6 Dual Lite Board
- riot,imx6s-riotboard # RIoTboard i.MX6S
+ - sielaff,imx6dl-board # Sielaff i.MX6 Solo Board
- skov,imx6dl-skov-revc-lt2 # SKOV IMX6 CPU SoloCore lt2
- skov,imx6dl-skov-revc-lt6 # SKOV IMX6 CPU SoloCore lt6
- solidrun,cubox-i/dl # SolidRun Cubox-i Solo/DualLite
@@ -708,6 +710,7 @@
- toradex,colibri-imx6ull # Colibri iMX6ULL Modules
- toradex,colibri-imx6ull-emmc # Colibri iMX6ULL 1GB (eMMC) Module
- toradex,colibri-imx6ull-wifi # Colibri iMX6ULL Wi-Fi / BT Modules
+ - uni-t,uti260b # UNI-T UTi260B Thermal Camera
- const: fsl,imx6ull
- description: i.MX6ULL Armadeus Systems OPOS6ULDev Board
@@ -1026,7 +1029,7 @@
items:
- enum:
- dimonoff,gateway-evk # i.MX8MN Dimonoff Gateway EVK Board
- - rve,rve-gateway # i.MX8MN RVE Gateway Board
+ - rve,gateway # i.MX8MN RVE Gateway Board
- variscite,var-som-mx8mn-symphony
- const: variscite,var-som-mx8mn
- const: fsl,imx8mn
@@ -1194,7 +1197,8 @@
- description: i.MX8QM Boards with Toradex Apalis iMX8 Modules
items:
- enum:
- - toradex,apalis-imx8-eval # Apalis iMX8 Module on Apalis Evaluation Board
+ - toradex,apalis-imx8-eval # Apalis iMX8 Module on Apalis Evaluation V1.0/V1.1 Board
+ - toradex,apalis-imx8-eval-v1.2 # Apalis iMX8 Module on Apalis Evaluation V1.2 Board
- toradex,apalis-imx8-ixora-v1.1 # Apalis iMX8 Module on Ixora V1.1 Carrier Board
- const: toradex,apalis-imx8
- const: fsl,imx8qm
@@ -1202,7 +1206,8 @@
- description: i.MX8QM Boards with Toradex Apalis iMX8 V1.1 Modules
items:
- enum:
- - toradex,apalis-imx8-v1.1-eval # Apalis iMX8 V1.1 Module on Apalis Eval. Board
+ - toradex,apalis-imx8-v1.1-eval # Apalis iMX8 V1.1 Module on Apalis Eval. V1.0/V1.1 Board
+ - toradex,apalis-imx8-v1.1-eval-v1.2 # Apalis iMX8 V1.1 Module on Apalis Eval. V1.2 Board
- toradex,apalis-imx8-v1.1-ixora-v1.1 # Apalis iMX8 V1.1 Module on Ixora V1.1 C. Board
- toradex,apalis-imx8-v1.1-ixora-v1.2 # Apalis iMX8 V1.1 Module on Ixora V1.2 C. Board
- const: toradex,apalis-imx8-v1.1
@@ -1232,6 +1237,22 @@
- const: toradex,colibri-imx8x
- const: fsl,imx8qxp
+ - description:
+ TQMa8Xx is a series of SOM featuring NXP i.MX8X system-on-chip
+ variants. It is designed to be clicked on different carrier boards
+ MBa8Xx is the starterkit
+ oneOf:
+ - items:
+ - enum:
+ - tq,imx8dxp-tqma8xdp-mba8xx # TQ-Systems GmbH TQMa8XDP SOM on MBa8Xx
+ - const: tq,imx8dxp-tqma8xdp # TQ-Systems GmbH TQMa8XDP SOM (with i.MX8DXP)
+ - const: fsl,imx8dxp
+ - items:
+ - enum:
+ - tq,imx8qxp-tqma8xqp-mba8xx # TQ-Systems GmbH TQMa8XQP SOM on MBa8Xx
+ - const: tq,imx8qxp-tqma8xqp # TQ-Systems GmbH TQMa8XQP SOM (with i.MX8QXP)
+ - const: fsl,imx8qxp
+
- description: i.MX8ULP based Boards
items:
- enum:
@@ -1275,6 +1296,18 @@
- const: tq,imx93-tqma9352 # TQ-Systems GmbH i.MX93 TQMa93xxCA/LA SOM
- const: fsl,imx93
+ - description: PHYTEC phyCORE-i.MX93 SoM based boards
+ items:
+ - const: phytec,imx93-phyboard-segin # phyBOARD-Segin with i.MX93
+ - const: phytec,imx93-phycore-som # phyCORE-i.MX93 SoM
+ - const: fsl,imx93
+
+ - description: Variscite VAR-SOM-MX93 based boards
+ items:
+ - const: variscite,var-som-mx93-symphony
+ - const: variscite,var-som-mx93
+ - const: fsl,imx93
+
- description:
Freescale Vybrid Platform Device Tree Bindings
diff --git a/Documentation/devicetree/bindings/arm/marvell/armada-38x.txt b/Documentation/devicetree/bindings/arm/marvell/armada-38x.txt
deleted file mode 100644
index 202953f..0000000
--- a/Documentation/devicetree/bindings/arm/marvell/armada-38x.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-Marvell Armada 38x Platforms Device Tree Bindings
--------------------------------------------------
-
-Boards with a SoC of the Marvell Armada 38x family shall have the
-following property:
-
-Required root node property:
-
- - compatible: must contain "marvell,armada380"
-
-In addition, boards using the Marvell Armada 385 SoC shall have the
-following property before the previous one:
-
-Required root node property:
-
-compatible: must contain "marvell,armada385"
-
-In addition, boards using the Marvell Armada 388 SoC shall have the
-following property before the previous one:
-
-Required root node property:
-
-compatible: must contain "marvell,armada388"
-
-Example:
-
-compatible = "marvell,a385-rd", "marvell,armada385", "marvell,armada380";
diff --git a/Documentation/devicetree/bindings/arm/marvell/armada-38x.yaml b/Documentation/devicetree/bindings/arm/marvell/armada-38x.yaml
new file mode 100644
index 0000000..cdf805b
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/marvell/armada-38x.yaml
@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/marvell/armada-38x.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Marvell Armada 38x Platforms
+
+maintainers:
+ - Gregory CLEMENT <gregory.clement@bootlin.com>
+
+properties:
+ $nodename:
+ const: '/'
+ compatible:
+ oneOf:
+
+ - description:
+ Netgear Armada 380 GS110EM Managed Switch.
+ items:
+ - const: netgear,gs110emx
+ - const: marvell,armada380
+
+ - description:
+ Marvell Armada 385 Development Boards.
+ items:
+ - enum:
+ - marvell,a385-db-amc
+ - marvell,a385-db-ap
+ - const: marvell,armada385
+ - const: marvell,armada380
+
+ - description:
+ SolidRun Armada 385 based single-board computers.
+ items:
+ - enum:
+ - solidrun,clearfog-gtr-l8
+ - solidrun,clearfog-gtr-s4
+ - const: marvell,armada385
+ - const: marvell,armada380
+
+ - description:
+ Kobol Armada 388 based Helios-4 NAS.
+ items:
+ - const: kobol,helios4
+ - const: marvell,armada388
+ - const: marvell,armada385
+ - const: marvell,armada380
+
+ - description:
+ Marvell Armada 388 Development Boards.
+ items:
+ - enum:
+ - marvell,a388-gp
+ - const: marvell,armada388
+ - const: marvell,armada385
+ - const: marvell,armada380
+
+ - description:
+ SolidRun Armada 388 clearfog family single-board computers.
+ items:
+ - enum:
+ - solidrun,clearfog-base-a1
+ - solidrun,clearfog-pro-a1
+ - const: solidrun,clearfog-a1
+ - const: marvell,armada388
+ - const: marvell,armada385
+ - const: marvell,armada380
+
+additionalProperties: true
diff --git a/Documentation/devicetree/bindings/arm/mediatek.yaml b/Documentation/devicetree/bindings/arm/mediatek.yaml
index 6f2f64a..09f9ffd 100644
--- a/Documentation/devicetree/bindings/arm/mediatek.yaml
+++ b/Documentation/devicetree/bindings/arm/mediatek.yaml
@@ -17,6 +17,7 @@
const: '/'
compatible:
oneOf:
+ # Sort by SoC (last) compatible, then board compatible
- items:
- enum:
- mediatek,mt2701-evb
@@ -84,6 +85,11 @@
- const: mediatek,mt7629
- items:
- enum:
+ - xiaomi,ax3000t
+ - const: mediatek,mt7981b
+ - items:
+ - enum:
+ - acelink,ew-7886cax
- bananapi,bpi-r3
- mediatek,mt7986a-rfb
- const: mediatek,mt7986a
@@ -93,6 +99,10 @@
- const: mediatek,mt7986b
- items:
- enum:
+ - bananapi,bpi-r4
+ - const: mediatek,mt7988a
+ - items:
+ - enum:
- mediatek,mt8127-moose
- const: mediatek,mt8127
- items:
@@ -129,75 +139,10 @@
- enum:
- mediatek,mt8173-evb
- const: mediatek,mt8173
- - items:
- - enum:
- - mediatek,mt8183-evb
- - const: mediatek,mt8183
- - description: Google Hayato rev5
- items:
- - const: google,hayato-rev5-sku2
- - const: google,hayato-sku2
- - const: google,hayato
- - const: mediatek,mt8192
- - description: Google Hayato
- items:
- - const: google,hayato-rev1
- - const: google,hayato
- - const: mediatek,mt8192
- - description: Google Spherion rev4 (Acer Chromebook 514)
- items:
- - const: google,spherion-rev4
- - const: google,spherion
- - const: mediatek,mt8192
- - description: Google Spherion (Acer Chromebook 514)
- items:
- - const: google,spherion-rev3
- - const: google,spherion-rev2
- - const: google,spherion-rev1
- - const: google,spherion-rev0
- - const: google,spherion
- - const: mediatek,mt8192
- - description: Acer Tomato (Acer Chromebook Spin 513 CP513-2H)
- items:
- - enum:
- - google,tomato-rev2
- - google,tomato-rev1
- - const: google,tomato
- - const: mediatek,mt8195
- - description: Acer Tomato rev3 - 4 (Acer Chromebook Spin 513 CP513-2H)
- items:
- - const: google,tomato-rev4
- - const: google,tomato-rev3
- - const: google,tomato
- - const: mediatek,mt8195
- - items:
- - enum:
- - mediatek,mt8186-evb
- - const: mediatek,mt8186
- - items:
- - enum:
- - mediatek,mt8188-evb
- - const: mediatek,mt8188
- - items:
- - enum:
- - mediatek,mt8192-evb
- - const: mediatek,mt8192
- - items:
- - enum:
- - mediatek,mt8195-demo
- - mediatek,mt8195-evb
- - const: mediatek,mt8195
- description: Google Burnet (HP Chromebook x360 11MK G3 EE)
items:
- const: google,burnet
- const: mediatek,mt8183
- - description: Google Krane (Lenovo IdeaPad Duet, 10e,...)
- items:
- - enum:
- - google,krane-sku0
- - google,krane-sku176
- - const: google,krane
- - const: mediatek,mt8183
- description: Google Cozmo (Acer Chromebook 314)
items:
- const: google,cozmo
@@ -255,6 +200,13 @@
- google,kodama-sku32
- const: google,kodama
- const: mediatek,mt8183
+ - description: Google Krane (Lenovo IdeaPad Duet, 10e,...)
+ items:
+ - enum:
+ - google,krane-sku0
+ - google,krane-sku176
+ - const: google,krane
+ - const: mediatek,mt8183
- description: Google Makomo (Lenovo 100e Chromebook 2nd Gen MTK 2)
items:
- enum:
@@ -278,8 +230,123 @@
- const: mediatek,mt8183
- items:
- enum:
+ - mediatek,mt8183-evb
+ - const: mediatek,mt8183
+ - items:
+ - enum:
- mediatek,mt8183-pumpkin
- const: mediatek,mt8183
+ - description: Google Magneton (Lenovo IdeaPad Slim 3 Chromebook (14M868))
+ items:
+ - const: google,steelix-sku393219
+ - const: google,steelix-sku393216
+ - const: google,steelix
+ - const: mediatek,mt8186
+ - description: Google Magneton (Lenovo IdeaPad Slim 3 Chromebook (14M868))
+ items:
+ - const: google,steelix-sku393220
+ - const: google,steelix-sku393217
+ - const: google,steelix
+ - const: mediatek,mt8186
+ - description: Google Magneton (Lenovo IdeaPad Slim 3 Chromebook (14M868))
+ items:
+ - const: google,steelix-sku393221
+ - const: google,steelix-sku393218
+ - const: google,steelix
+ - const: mediatek,mt8186
+ - description: Google Rusty (Lenovo 100e Chromebook Gen 4)
+ items:
+ - const: google,steelix-sku196609
+ - const: google,steelix-sku196608
+ - const: google,steelix
+ - const: mediatek,mt8186
+ - description: Google Steelix (Lenovo 300e Yoga Chromebook Gen 4)
+ items:
+ - enum:
+ - google,steelix-sku131072
+ - google,steelix-sku131073
+ - const: google,steelix
+ - const: mediatek,mt8186
+ - description: Google Tentacruel (ASUS Chromebook CM14 Flip CM1402F)
+ items:
+ - const: google,tentacruel-sku262147
+ - const: google,tentacruel-sku262146
+ - const: google,tentacruel-sku262145
+ - const: google,tentacruel-sku262144
+ - const: google,tentacruel
+ - const: mediatek,mt8186
+ - description: Google Tentacruel (ASUS Chromebook CM14 Flip CM1402F)
+ items:
+ - const: google,tentacruel-sku262151
+ - const: google,tentacruel-sku262150
+ - const: google,tentacruel-sku262149
+ - const: google,tentacruel-sku262148
+ - const: google,tentacruel
+ - const: mediatek,mt8186
+ - description: Google Tentacool (ASUS Chromebook CM14 CM1402C)
+ items:
+ - const: google,tentacruel-sku327681
+ - const: google,tentacruel
+ - const: mediatek,mt8186
+ - description: Google Tentacool (ASUS Chromebook CM14 CM1402C)
+ items:
+ - const: google,tentacruel-sku327683
+ - const: google,tentacruel
+ - const: mediatek,mt8186
+ - items:
+ - enum:
+ - mediatek,mt8186-evb
+ - const: mediatek,mt8186
+ - items:
+ - enum:
+ - mediatek,mt8188-evb
+ - const: mediatek,mt8188
+ - description: Google Hayato
+ items:
+ - const: google,hayato-rev1
+ - const: google,hayato
+ - const: mediatek,mt8192
+ - description: Google Hayato rev5
+ items:
+ - const: google,hayato-rev5-sku2
+ - const: google,hayato-sku2
+ - const: google,hayato
+ - const: mediatek,mt8192
+ - description: Google Spherion (Acer Chromebook 514)
+ items:
+ - const: google,spherion-rev3
+ - const: google,spherion-rev2
+ - const: google,spherion-rev1
+ - const: google,spherion-rev0
+ - const: google,spherion
+ - const: mediatek,mt8192
+ - description: Google Spherion rev4 (Acer Chromebook 514)
+ items:
+ - const: google,spherion-rev4
+ - const: google,spherion
+ - const: mediatek,mt8192
+ - items:
+ - enum:
+ - mediatek,mt8192-evb
+ - const: mediatek,mt8192
+ - description: Acer Tomato (Acer Chromebook Spin 513 CP513-2H)
+ items:
+ - enum:
+ - google,tomato-rev2
+ - google,tomato-rev1
+ - const: google,tomato
+ - const: mediatek,mt8195
+ - description: Acer Tomato rev3 - 4 (Acer Chromebook Spin 513 CP513-2H)
+ items:
+ - const: google,tomato-rev4
+ - const: google,tomato-rev3
+ - const: google,tomato
+ - const: mediatek,mt8195
+ - items:
+ - enum:
+ - mediatek,mt8195-demo
+ - mediatek,mt8195-evb
+ - const: mediatek,mt8195
- items:
- enum:
- mediatek,mt8365-evk
@@ -287,6 +354,7 @@
- items:
- enum:
- mediatek,mt8395-evk
+ - radxa,nio-12l
- const: mediatek,mt8395
- const: mediatek,mt8195
- items:
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,hifsys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,hifsys.txt
deleted file mode 100644
index 323905a..0000000
--- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,hifsys.txt
+++ /dev/null
@@ -1,26 +0,0 @@
-Mediatek hifsys controller
-============================
-
-The Mediatek hifsys controller provides various clocks and reset
-outputs to the system.
-
-Required Properties:
-
-- compatible: Should be:
- - "mediatek,mt2701-hifsys", "syscon"
- - "mediatek,mt7622-hifsys", "syscon"
- - "mediatek,mt7623-hifsys", "mediatek,mt2701-hifsys", "syscon"
-- #clock-cells: Must be 1
-
-The hifsys controller uses the common clk binding from
-Documentation/devicetree/bindings/clock/clock-bindings.txt
-The available clocks are defined in dt-bindings/clock/mt*-clk.h.
-
-Example:
-
-hifsys: clock-controller@1a000000 {
- compatible = "mediatek,mt2701-hifsys", "syscon";
- reg = <0 0x1a000000 0 0x1000>;
- #clock-cells = <1>;
- #reset-cells = <1>;
-};
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,pciesys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,pciesys.txt
deleted file mode 100644
index d179a61..0000000
--- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,pciesys.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-MediaTek PCIESYS controller
-============================
-
-The MediaTek PCIESYS controller provides various clocks to the system.
-
-Required Properties:
-
-- compatible: Should be:
- - "mediatek,mt7622-pciesys", "syscon"
- - "mediatek,mt7629-pciesys", "syscon"
-- #clock-cells: Must be 1
-- #reset-cells: Must be 1
-
-The PCIESYS controller uses the common clk binding from
-Documentation/devicetree/bindings/clock/clock-bindings.txt
-The available clocks are defined in dt-bindings/clock/mt*-clk.h.
-
-Example:
-
-pciesys: pciesys@1a100800 {
- compatible = "mediatek,mt7622-pciesys", "syscon";
- reg = <0 0x1a100800 0 0x1000>;
- #clock-cells = <1>;
- #reset-cells = <1>;
-};
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,ssusbsys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,ssusbsys.txt
deleted file mode 100644
index 7cb02c93..0000000
--- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,ssusbsys.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-MediaTek SSUSBSYS controller
-============================
-
-The MediaTek SSUSBSYS controller provides various clocks to the system.
-
-Required Properties:
-
-- compatible: Should be:
- - "mediatek,mt7622-ssusbsys", "syscon"
- - "mediatek,mt7629-ssusbsys", "syscon"
-- #clock-cells: Must be 1
-- #reset-cells: Must be 1
-
-The SSUSBSYS controller uses the common clk binding from
-Documentation/devicetree/bindings/clock/clock-bindings.txt
-The available clocks are defined in dt-bindings/clock/mt*-clk.h.
-
-Example:
-
-ssusbsys: ssusbsys@1a000000 {
- compatible = "mediatek,mt7622-ssusbsys", "syscon";
- reg = <0 0x1a000000 0 0x1000>;
- #clock-cells = <1>;
- #reset-cells = <1>;
-};
diff --git a/Documentation/devicetree/bindings/arm/msm/qcom,saw2.txt b/Documentation/devicetree/bindings/arm/msm/qcom,saw2.txt
deleted file mode 100644
index c0e3c3a..0000000
--- a/Documentation/devicetree/bindings/arm/msm/qcom,saw2.txt
+++ /dev/null
@@ -1,58 +0,0 @@
-SPM AVS Wrapper 2 (SAW2)
-
-The SAW2 is a wrapper around the Subsystem Power Manager (SPM) and the
-Adaptive Voltage Scaling (AVS) hardware. The SPM is a programmable
-power-controller that transitions a piece of hardware (like a processor or
-subsystem) into and out of low power modes via a direct connection to
-the PMIC. It can also be wired up to interact with other processors in the
-system, notifying them when a low power state is entered or exited.
-
-Multiple revisions of the SAW hardware are supported using these Device Nodes.
-SAW2 revisions differ in the register offset and configuration data. Also, the
-same revision of the SAW in different SoCs may have different configuration
-data due the differences in hardware capabilities. Hence the SoC name, the
-version of the SAW hardware in that SoC and the distinction between cpu (big
-or Little) or cache, may be needed to uniquely identify the SAW register
-configuration and initialization data. The compatible string is used to
-indicate this parameter.
-
-PROPERTIES
-
-- compatible:
- Usage: required
- Value type: <string>
- Definition: Must have
- "qcom,saw2"
- A more specific value could be one of:
- "qcom,apq8064-saw2-v1.1-cpu"
- "qcom,msm8226-saw2-v2.1-cpu"
- "qcom,msm8974-saw2-v2.1-cpu"
- "qcom,apq8084-saw2-v2.1-cpu"
-
-- reg:
- Usage: required
- Value type: <prop-encoded-array>
- Definition: the first element specifies the base address and size of
- the register region. An optional second element specifies
- the base address and size of the alias register region.
-
-- regulator:
- Usage: optional
- Value type: boolean
- Definition: Indicates that this SPM device acts as a regulator device
- device for the core (CPU or Cache) the SPM is attached
- to.
-
-Example 1:
-
- power-controller@2099000 {
- compatible = "qcom,saw2";
- reg = <0x02099000 0x1000>, <0x02009000 0x1000>;
- regulator;
- };
-
-Example 2:
- saw0: power-controller@f9089000 {
- compatible = "qcom,apq8084-saw2-v2.1-cpu", "qcom,saw2";
- reg = <0xf9089000 0x1000>, <0xf9009000 0x1000>;
- };
diff --git a/Documentation/devicetree/bindings/arm/qcom,coresight-tpdm.yaml b/Documentation/devicetree/bindings/arm/qcom,coresight-tpdm.yaml
index 61ddc3b..8eec07d 100644
--- a/Documentation/devicetree/bindings/arm/qcom,coresight-tpdm.yaml
+++ b/Documentation/devicetree/bindings/arm/qcom,coresight-tpdm.yaml
@@ -44,14 +44,21 @@
minItems: 1
maxItems: 2
- qcom,dsb-element-size:
+ qcom,dsb-element-bits:
description:
Specifies the DSB(Discrete Single Bit) element size supported by
the monitor. The associated aggregator will read this size before it
is enabled. DSB element size currently only supports 32-bit and 64-bit.
- $ref: /schemas/types.yaml#/definitions/uint8
enum: [32, 64]
+ qcom,cmb-element-bits:
+ description:
+ Specifies the CMB(Continuous Multi-Bit) element size supported by
+ the monitor. The associated aggregator will read this size before it
+ is enabled. CMB element size currently only supports 8-bit, 32-bit
+ and 64-bit.
+ enum: [8, 32, 64]
+
qcom,dsb-msrs-num:
description:
Specifies the number of DSB(Discrete Single Bit) MSR(mux select register)
@@ -61,6 +68,15 @@
minimum: 0
maximum: 32
+ qcom,cmb-msrs-num:
+ description:
+ Specifies the number of CMB MSR(mux select register) registers supported
+ by the monitor. If this property is not configured or set to 0, it means
+ this TPDM doesn't support CMB MSR.
+ $ref: /schemas/types.yaml#/definitions/uint32
+ minimum: 0
+ maximum: 32
+
clocks:
maxItems: 1
@@ -94,7 +110,7 @@
compatible = "qcom,coresight-tpdm", "arm,primecell";
reg = <0x0684c000 0x1000>;
- qcom,dsb-element-size = /bits/ 8 <32>;
+ qcom,dsb-element-bits = <32>;
qcom,dsb-msrs-num = <16>;
clocks = <&aoss_qmp>;
@@ -110,4 +126,22 @@
};
};
+ tpdm@6c29000 {
+ compatible = "qcom,coresight-tpdm", "arm,primecell";
+ reg = <0x06c29000 0x1000>;
+
+ qcom,cmb-element-bits = <64>;
+ qcom,cmb-msrs-num = <32>;
+
+ clocks = <&aoss_qmp>;
+ clock-names = "apb_pclk";
+
+ out-ports {
+ port {
+ tpdm_ipcc_out_funnel_center: endpoint {
+ remote-endpoint = <&funnel_center_in_tpdm_ipcc>;
+ };
+ };
+ };
+ };
...
diff --git a/Documentation/devicetree/bindings/arm/qcom.yaml b/Documentation/devicetree/bindings/arm/qcom.yaml
index 1a5fb88..66beaac 100644
--- a/Documentation/devicetree/bindings/arm/qcom.yaml
+++ b/Documentation/devicetree/bindings/arm/qcom.yaml
@@ -10,17 +10,10 @@
- Bjorn Andersson <bjorn.andersson@linaro.org>
description: |
- Some qcom based bootloaders identify the dtb blob based on a set of
- device properties like SoC and platform and revisions of those components.
- To support this scheme, we encode this information into the board compatible
- string.
-
- Each board must specify a top-level board compatible string with the following
- format:
-
- compatible = "qcom,<SoC>[-<soc_version>][-<foundry_id>]-<board>[/<subtype>][-<board_version>]"
-
- The 'SoC' and 'board' elements are required. All other elements are optional.
+ For devices using the Qualcomm SoC the "compatible" properties consists of
+ one or several "manufacturer,model" strings, describing the device itself,
+ followed by one or several "qcom,<SoC>" strings, describing the SoC used in
+ the device.
The 'SoC' element must be one of the following strings:
@@ -90,43 +83,9 @@
sm8650
x1e80100
- The 'board' element must be one of the following strings:
-
- adp
- cdp
- dragonboard
- idp
- liquid
- mtp
- qcp
- qrd
- rb2
- ride
- sbc
- x100
-
- The 'soc_version' and 'board_version' elements take the form of v<Major>.<Minor>
- where the minor number may be omitted when it's zero, i.e. v1.0 is the same
- as v1. If all versions of the 'board_version' elements match, then a
- wildcard '*' should be used, e.g. 'v*'.
-
- The 'foundry_id' and 'subtype' elements are one or more digits from 0 to 9.
-
- Examples:
-
- "qcom,msm8916-v1-cdp-pm8916-v2.1"
-
- A CDP board with an msm8916 SoC, version 1 paired with a pm8916 PMIC of version
- 2.1.
-
- "qcom,apq8074-v2.0-2-dragonboard/1-v0.1"
-
- A dragonboard board v0.1 of subtype 1 with an apq8074 SoC version 2, made in
- foundry 2.
-
There are many devices in the list below that run the standard ChromeOS
bootloader setup and use the open source depthcharge bootloader to boot the
- OS. These devices do not use the scheme described above. For details, see:
+ OS. These devices use the bootflow explained at
https://docs.kernel.org/arch/arm/google/chromebook-boot-flow.html
properties:
@@ -187,6 +146,7 @@
- microsoft,superman-lte
- microsoft,tesla
- motorola,peregrine
+ - samsung,matisselte
- const: qcom,msm8926
- const: qcom,msm8226
@@ -244,11 +204,15 @@
- samsung,a5u-eur
- samsung,e5
- samsung,e7
+ - samsung,fortuna3g
+ - samsung,gprimeltecan
- samsung,grandmax
+ - samsung,grandprimelte
- samsung,gt510
- samsung,gt58
- samsung,j5
- samsung,j5x
+ - samsung,rossa
- samsung,serranove
- thwc,uf896
- thwc,ufi001c
@@ -988,6 +952,7 @@
- items:
- enum:
+ - xiaomi,curtana
- xiaomi,joyeuse
- const: qcom,sm7125
@@ -1035,6 +1000,7 @@
- items:
- enum:
+ - qcom,sm8550-hdk
- qcom,sm8550-mtp
- qcom,sm8550-qrd
- const: qcom,sm8550
diff --git a/Documentation/devicetree/bindings/arm/rockchip.yaml b/Documentation/devicetree/bindings/arm/rockchip.yaml
index 5cf5cbe..fcf7316 100644
--- a/Documentation/devicetree/bindings/arm/rockchip.yaml
+++ b/Documentation/devicetree/bindings/arm/rockchip.yaml
@@ -37,29 +37,16 @@
- anbernic,rg351v
- const: rockchip,rk3326
- - description: Anbernic RG353P
+ - description: Anbernic RK3566 Handheld Gaming Console
items:
- - const: anbernic,rg353p
- - const: rockchip,rk3566
-
- - description: Anbernic RG353PS
- items:
- - const: anbernic,rg353ps
- - const: rockchip,rk3566
-
- - description: Anbernic RG353V
- items:
- - const: anbernic,rg353v
- - const: rockchip,rk3566
-
- - description: Anbernic RG353VS
- items:
- - const: anbernic,rg353vs
- - const: rockchip,rk3566
-
- - description: Anbernic RG503
- items:
- - const: anbernic,rg503
+ - enum:
+ - anbernic,rg353p
+ - anbernic,rg353ps
+ - anbernic,rg353v
+ - anbernic,rg353vs
+ - anbernic,rg503
+ - anbernic,rg-arc-d
+ - anbernic,rg-arc-s
- const: rockchip,rk3566
- description: Asus Tinker board
@@ -237,6 +224,13 @@
- friendlyarm,nanopi-r5s
- const: rockchip,rk3568
+ - description: FriendlyElec NanoPi R6 series boards
+ items:
+ - enum:
+ - friendlyarm,nanopi-r6c
+ - friendlyarm,nanopi-r6s
+ - const: rockchip,rk3588s
+
- description: FriendlyElec NanoPC T6
items:
- const: friendlyarm,nanopc-t6
@@ -626,9 +620,9 @@
- const: openailab,eaidk-610
- const: rockchip,rk3399
- - description: Orange Pi RK3399 board
+ - description: Xunlong Orange Pi RK3399 board
items:
- - const: rockchip,rk3399-orangepi
+ - const: xunlong,rk3399-orangepi
- const: rockchip,rk3399
- description: Phytec phyCORE-RK3288 Rapid Development Kit
@@ -655,6 +649,14 @@
- const: pine64,pinephone-pro
- const: rockchip,rk3399
+ - description: Pine64 PineTab2
+