aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/ABI/testing/sysfs-ata11
-rw-r--r--Documentation/ABI/testing/sysfs-bus-iio2
-rw-r--r--Documentation/ABI/testing/sysfs-bus-iio-vf6102
-rw-r--r--Documentation/ABI/testing/sysfs-class-net-queues22
-rw-r--r--Documentation/ABI/testing/sysfs-devices-system-cpu10
-rw-r--r--Documentation/ABI/testing/sysfs-kernel-oops_count6
-rw-r--r--Documentation/ABI/testing/sysfs-kernel-warn_count6
-rw-r--r--Documentation/DMA-attributes.txt10
-rw-r--r--Documentation/admin-guide/cgroup-v1/memory.rst13
-rw-r--r--Documentation/admin-guide/hw-vuln/gather_data_sampling.rst109
-rw-r--r--Documentation/admin-guide/hw-vuln/index.rst2
-rw-r--r--Documentation/admin-guide/hw-vuln/processor_mmio_stale_data.rst260
-rw-r--r--Documentation/admin-guide/hw-vuln/spectre.rst39
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt153
-rw-r--r--Documentation/admin-guide/pm/cpuidle.rst15
-rw-r--r--Documentation/admin-guide/security-bugs.rst39
-rw-r--r--Documentation/admin-guide/sysctl/kernel.rst54
-rw-r--r--Documentation/admin-guide/sysctl/net.rst19
-rw-r--r--Documentation/admin-guide/sysctl/vm.rst9
-rw-r--r--Documentation/arm64/silicon-errata.rst7
-rw-r--r--Documentation/atomic_bitops.txt2
-rw-r--r--Documentation/conf.py2
-rw-r--r--Documentation/dev-tools/gdb-kernel-debugging.rst4
-rw-r--r--Documentation/devicetree/bindings/arm/qcom.yaml2
-rw-r--r--Documentation/devicetree/bindings/dma/allwinner,sun50i-a64-dma.yaml2
-rw-r--r--Documentation/devicetree/bindings/dma/moxa,moxart-dma.txt4
-rw-r--r--Documentation/devicetree/bindings/gpio/gpio-altera.txt5
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/ti,sci-intr.txt2
-rw-r--r--Documentation/devicetree/bindings/phy/amlogic,g12a-usb3-pcie-phy.yaml (renamed from Documentation/devicetree/bindings/phy/amlogic,meson-g12a-usb3-pcie-phy.yaml)6
-rw-r--r--Documentation/devicetree/bindings/rtc/allwinner,sun6i-a31-rtc.yaml1
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,wcd9335.txt2
-rw-r--r--Documentation/driver-api/spi.rst4
-rw-r--r--Documentation/fault-injection/fault-injection.rst16
-rw-r--r--Documentation/filesystems/directory-locking.rst37
-rw-r--r--Documentation/filesystems/locking.rst5
-rw-r--r--Documentation/filesystems/porting.rst18
-rw-r--r--Documentation/filesystems/vfs.rst2
-rw-r--r--Documentation/firmware-guide/acpi/apei/einj.rst2
-rw-r--r--Documentation/input/joydev/joystick.rst1
-rw-r--r--Documentation/ioctl/ioctl-number.rst1
-rw-r--r--Documentation/media/kapi/v4l2-dev.rst4
-rw-r--r--Documentation/media/uapi/v4l/subdev-formats.rst27
-rw-r--r--Documentation/networking/af_xdp.rst268
-rw-r--r--Documentation/networking/decnet.txt230
-rw-r--r--Documentation/networking/ip-sysctl.txt13
-rw-r--r--Documentation/power/runtime_pm.rst6
-rw-r--r--Documentation/process/code-of-conduct-interpretation.rst2
-rw-r--r--Documentation/process/submitting-patches.rst2
-rw-r--r--Documentation/sound/hd-audio/models.rst2
-rw-r--r--Documentation/sound/soc/dapm.rst2
-rw-r--r--Documentation/sphinx/load_config.py6
-rw-r--r--Documentation/trace/histogram.rst2
-rw-r--r--Documentation/translations/zh_CN/video4linux/v4l2-framework.txt4
-rw-r--r--Documentation/virt/kvm/api.txt18
-rw-r--r--Documentation/virt/kvm/devices/vm.txt4
-rw-r--r--Documentation/vm/memory-model.rst3
56 files changed, 1073 insertions, 426 deletions
diff --git a/Documentation/ABI/testing/sysfs-ata b/Documentation/ABI/testing/sysfs-ata
index 9ab0ef1dd1c7..299e0d1dc161 100644
--- a/Documentation/ABI/testing/sysfs-ata
+++ b/Documentation/ABI/testing/sysfs-ata
@@ -107,13 +107,14 @@ Description:
described in ATA8 7.16 and 7.17. Only valid if
the device is not a PM.
- pio_mode: (RO) Transfer modes supported by the device when
- in PIO mode. Mostly used by PATA device.
+ pio_mode: (RO) PIO transfer mode used by the device.
+ Mostly used by PATA devices.
- xfer_mode: (RO) Current transfer mode
+ xfer_mode: (RO) Current transfer mode. Mostly used by
+ PATA devices.
- dma_mode: (RO) Transfer modes supported by the device when
- in DMA mode. Mostly used by PATA device.
+ dma_mode: (RO) DMA transfer mode used by the device.
+ Mostly used by PATA devices.
class: (RO) Device class. Can be "ata" for disk,
"atapi" for packet device, "pmp" for PM, or
diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio
index c3767d4d01a6..4d873e813949 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio
@@ -138,7 +138,7 @@ Description:
Raw capacitance measurement from channel Y. Units after
application of scale and offset are nanofarads.
-What: /sys/.../iio:deviceX/in_capacitanceY-in_capacitanceZ_raw
+What: /sys/.../iio:deviceX/in_capacitanceY-capacitanceZ_raw
KernelVersion: 3.2
Contact: linux-iio@vger.kernel.org
Description:
diff --git a/Documentation/ABI/testing/sysfs-bus-iio-vf610 b/Documentation/ABI/testing/sysfs-bus-iio-vf610
index 308a6756d3bf..491ead804488 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio-vf610
+++ b/Documentation/ABI/testing/sysfs-bus-iio-vf610
@@ -1,4 +1,4 @@
-What: /sys/bus/iio/devices/iio:deviceX/conversion_mode
+What: /sys/bus/iio/devices/iio:deviceX/in_conversion_mode
KernelVersion: 4.2
Contact: linux-iio@vger.kernel.org
Description:
diff --git a/Documentation/ABI/testing/sysfs-class-net-queues b/Documentation/ABI/testing/sysfs-class-net-queues
index 978b76358661..40d5aab8452d 100644
--- a/Documentation/ABI/testing/sysfs-class-net-queues
+++ b/Documentation/ABI/testing/sysfs-class-net-queues
@@ -1,4 +1,4 @@
-What: /sys/class/<iface>/queues/rx-<queue>/rps_cpus
+What: /sys/class/net/<iface>/queues/rx-<queue>/rps_cpus
Date: March 2010
KernelVersion: 2.6.35
Contact: netdev@vger.kernel.org
@@ -8,7 +8,7 @@ Description:
network device queue. Possible values depend on the number
of available CPU(s) in the system.
-What: /sys/class/<iface>/queues/rx-<queue>/rps_flow_cnt
+What: /sys/class/net/<iface>/queues/rx-<queue>/rps_flow_cnt
Date: April 2010
KernelVersion: 2.6.35
Contact: netdev@vger.kernel.org
@@ -16,7 +16,7 @@ Description:
Number of Receive Packet Steering flows being currently
processed by this particular network device receive queue.
-What: /sys/class/<iface>/queues/tx-<queue>/tx_timeout
+What: /sys/class/net/<iface>/queues/tx-<queue>/tx_timeout
Date: November 2011
KernelVersion: 3.3
Contact: netdev@vger.kernel.org
@@ -24,7 +24,7 @@ Description:
Indicates the number of transmit timeout events seen by this
network interface transmit queue.
-What: /sys/class/<iface>/queues/tx-<queue>/tx_maxrate
+What: /sys/class/net/<iface>/queues/tx-<queue>/tx_maxrate
Date: March 2015
KernelVersion: 4.1
Contact: netdev@vger.kernel.org
@@ -32,7 +32,7 @@ Description:
A Mbps max-rate set for the queue, a value of zero means disabled,
default is disabled.
-What: /sys/class/<iface>/queues/tx-<queue>/xps_cpus
+What: /sys/class/net/<iface>/queues/tx-<queue>/xps_cpus
Date: November 2010
KernelVersion: 2.6.38
Contact: netdev@vger.kernel.org
@@ -42,7 +42,7 @@ Description:
network device transmit queue. Possible vaules depend on the
number of available CPU(s) in the system.
-What: /sys/class/<iface>/queues/tx-<queue>/xps_rxqs
+What: /sys/class/net/<iface>/queues/tx-<queue>/xps_rxqs
Date: June 2018
KernelVersion: 4.18.0
Contact: netdev@vger.kernel.org
@@ -53,7 +53,7 @@ Description:
number of available receive queue(s) in the network device.
Default is disabled.
-What: /sys/class/<iface>/queues/tx-<queue>/byte_queue_limits/hold_time
+What: /sys/class/net/<iface>/queues/tx-<queue>/byte_queue_limits/hold_time
Date: November 2011
KernelVersion: 3.3
Contact: netdev@vger.kernel.org
@@ -62,7 +62,7 @@ Description:
of this particular network device transmit queue.
Default value is 1000.
-What: /sys/class/<iface>/queues/tx-<queue>/byte_queue_limits/inflight
+What: /sys/class/net/<iface>/queues/tx-<queue>/byte_queue_limits/inflight
Date: November 2011
KernelVersion: 3.3
Contact: netdev@vger.kernel.org
@@ -70,7 +70,7 @@ Description:
Indicates the number of bytes (objects) in flight on this
network device transmit queue.
-What: /sys/class/<iface>/queues/tx-<queue>/byte_queue_limits/limit
+What: /sys/class/net/<iface>/queues/tx-<queue>/byte_queue_limits/limit
Date: November 2011
KernelVersion: 3.3
Contact: netdev@vger.kernel.org
@@ -79,7 +79,7 @@ Description:
on this network device transmit queue. This value is clamped
to be within the bounds defined by limit_max and limit_min.
-What: /sys/class/<iface>/queues/tx-<queue>/byte_queue_limits/limit_max
+What: /sys/class/net/<iface>/queues/tx-<queue>/byte_queue_limits/limit_max
Date: November 2011
KernelVersion: 3.3
Contact: netdev@vger.kernel.org
@@ -88,7 +88,7 @@ Description:
queued on this network device transmit queue. See
include/linux/dynamic_queue_limits.h for the default value.
-What: /sys/class/<iface>/queues/tx-<queue>/byte_queue_limits/limit_min
+What: /sys/class/net/<iface>/queues/tx-<queue>/byte_queue_limits/limit_min
Date: November 2011
KernelVersion: 3.3
Contact: netdev@vger.kernel.org
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index c24afa60a30e..08e153614e09 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -480,15 +480,17 @@ Description: information about CPUs heterogeneity.
cpu_capacity: capacity of cpu#.
What: /sys/devices/system/cpu/vulnerabilities
+ /sys/devices/system/cpu/vulnerabilities/gather_data_sampling
+ /sys/devices/system/cpu/vulnerabilities/itlb_multihit
+ /sys/devices/system/cpu/vulnerabilities/l1tf
+ /sys/devices/system/cpu/vulnerabilities/mds
/sys/devices/system/cpu/vulnerabilities/meltdown
+ /sys/devices/system/cpu/vulnerabilities/mmio_stale_data
+ /sys/devices/system/cpu/vulnerabilities/spec_store_bypass
/sys/devices/system/cpu/vulnerabilities/spectre_v1
/sys/devices/system/cpu/vulnerabilities/spectre_v2
- /sys/devices/system/cpu/vulnerabilities/spec_store_bypass
- /sys/devices/system/cpu/vulnerabilities/l1tf
- /sys/devices/system/cpu/vulnerabilities/mds
/sys/devices/system/cpu/vulnerabilities/srbds
/sys/devices/system/cpu/vulnerabilities/tsx_async_abort
- /sys/devices/system/cpu/vulnerabilities/itlb_multihit
Date: January 2018
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
Description: Information about CPU vulnerabilities
diff --git a/Documentation/ABI/testing/sysfs-kernel-oops_count b/Documentation/ABI/testing/sysfs-kernel-oops_count
new file mode 100644
index 000000000000..156cca9dbc96
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-oops_count
@@ -0,0 +1,6 @@
+What: /sys/kernel/oops_count
+Date: November 2022
+KernelVersion: 6.2.0
+Contact: Linux Kernel Hardening List <linux-hardening@vger.kernel.org>
+Description:
+ Shows how many times the system has Oopsed since last boot.
diff --git a/Documentation/ABI/testing/sysfs-kernel-warn_count b/Documentation/ABI/testing/sysfs-kernel-warn_count
new file mode 100644
index 000000000000..90a029813717
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-warn_count
@@ -0,0 +1,6 @@
+What: /sys/kernel/warn_count
+Date: November 2022
+KernelVersion: 6.2.0
+Contact: Linux Kernel Hardening List <linux-hardening@vger.kernel.org>
+Description:
+ Shows how many times the system has Warned since last boot.
diff --git a/Documentation/DMA-attributes.txt b/Documentation/DMA-attributes.txt
index 7193505a98ca..8f8d97f65d73 100644
--- a/Documentation/DMA-attributes.txt
+++ b/Documentation/DMA-attributes.txt
@@ -156,13 +156,3 @@ accesses to DMA buffers in both privileged "supervisor" and unprivileged
subsystem that the buffer is fully accessible at the elevated privilege
level (and ideally inaccessible or at least read-only at the
lesser-privileged levels).
-
-DMA_ATTR_PRIVILEGED
--------------------
-
-Some advanced peripherals such as remote processors and GPUs perform
-accesses to DMA buffers in both privileged "supervisor" and unprivileged
-"user" modes. This attribute is used to indicate to the DMA-mapping
-subsystem that the buffer is fully accessible at the elevated privilege
-level (and ideally inaccessible or at least read-only at the
-lesser-privileged levels).
diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst
index 0ae4f564c2d6..6ec63c239616 100644
--- a/Documentation/admin-guide/cgroup-v1/memory.rst
+++ b/Documentation/admin-guide/cgroup-v1/memory.rst
@@ -82,6 +82,8 @@ Brief summary of control files.
memory.swappiness set/show swappiness parameter of vmscan
(See sysctl's vm.swappiness)
memory.move_charge_at_immigrate set/show controls of moving charges
+ This knob is deprecated and shouldn't be
+ used.
memory.oom_control set/show oom controls.
memory.numa_stat show the number of memory usage per numa
node
@@ -745,8 +747,15 @@ NOTE2:
It is recommended to set the soft limit always below the hard limit,
otherwise the hard limit will take precedence.
-8. Move charges at task migration
-=================================
+8. Move charges at task migration (DEPRECATED!)
+===============================================
+
+THIS IS DEPRECATED!
+
+It's expensive and unreliable! It's better practice to launch workload
+tasks directly from inside their target cgroup. Use dedicated workload
+cgroups to allow fine-grained policy adjustments without having to
+move physical pages between control domains.
Users can move charges associated with a task along with task migration, that
is, uncharge task's pages from the old cgroup and charge them to the new cgroup.
diff --git a/Documentation/admin-guide/hw-vuln/gather_data_sampling.rst b/Documentation/admin-guide/hw-vuln/gather_data_sampling.rst
new file mode 100644
index 000000000000..264bfa937f7d
--- /dev/null
+++ b/Documentation/admin-guide/hw-vuln/gather_data_sampling.rst
@@ -0,0 +1,109 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+GDS - Gather Data Sampling
+==========================
+
+Gather Data Sampling is a hardware vulnerability which allows unprivileged
+speculative access to data which was previously stored in vector registers.
+
+Problem
+-------
+When a gather instruction performs loads from memory, different data elements
+are merged into the destination vector register. However, when a gather
+instruction that is transiently executed encounters a fault, stale data from
+architectural or internal vector registers may get transiently forwarded to the
+destination vector register instead. This will allow a malicious attacker to
+infer stale data using typical side channel techniques like cache timing
+attacks. GDS is a purely sampling-based attack.
+
+The attacker uses gather instructions to infer the stale vector register data.
+The victim does not need to do anything special other than use the vector
+registers. The victim does not need to use gather instructions to be
+vulnerable.
+
+Because the buffers are shared between Hyper-Threads cross Hyper-Thread attacks
+are possible.
+
+Attack scenarios
+----------------
+Without mitigation, GDS can infer stale data across virtually all
+permission boundaries:
+
+ Non-enclaves can infer SGX enclave data
+ Userspace can infer kernel data
+ Guests can infer data from hosts
+ Guest can infer guest from other guests
+ Users can infer data from other users
+
+Because of this, it is important to ensure that the mitigation stays enabled in
+lower-privilege contexts like guests and when running outside SGX enclaves.
+
+The hardware enforces the mitigation for SGX. Likewise, VMMs should ensure
+that guests are not allowed to disable the GDS mitigation. If a host erred and
+allowed this, a guest could theoretically disable GDS mitigation, mount an
+attack, and re-enable it.
+
+Mitigation mechanism
+--------------------
+This issue is mitigated in microcode. The microcode defines the following new
+bits:
+
+ ================================ === ============================
+ IA32_ARCH_CAPABILITIES[GDS_CTRL] R/O Enumerates GDS vulnerability
+ and mitigation support.
+ IA32_ARCH_CAPABILITIES[GDS_NO] R/O Processor is not vulnerable.
+ IA32_MCU_OPT_CTRL[GDS_MITG_DIS] R/W Disables the mitigation
+ 0 by default.
+ IA32_MCU_OPT_CTRL[GDS_MITG_LOCK] R/W Locks GDS_MITG_DIS=0. Writes
+ to GDS_MITG_DIS are ignored
+ Can't be cleared once set.
+ ================================ === ============================
+
+GDS can also be mitigated on systems that don't have updated microcode by
+disabling AVX. This can be done by setting gather_data_sampling="force" or
+"clearcpuid=avx" on the kernel command-line.
+
+If used, these options will disable AVX use by turning off XSAVE YMM support.
+However, the processor will still enumerate AVX support. Userspace that
+does not follow proper AVX enumeration to check both AVX *and* XSAVE YMM
+support will break.
+
+Mitigation control on the kernel command line
+---------------------------------------------
+The mitigation can be disabled by setting "gather_data_sampling=off" or
+"mitigations=off" on the kernel command line. Not specifying either will default
+to the mitigation being enabled. Specifying "gather_data_sampling=force" will
+use the microcode mitigation when available or disable AVX on affected systems
+where the microcode hasn't been updated to include the mitigation.
+
+GDS System Information
+------------------------
+The kernel provides vulnerability status information through sysfs. For
+GDS this can be accessed by the following sysfs file:
+
+/sys/devices/system/cpu/vulnerabilities/gather_data_sampling
+
+The possible values contained in this file are:
+
+ ============================== =============================================
+ Not affected Processor not vulnerable.
+ Vulnerable Processor vulnerable and mitigation disabled.
+ Vulnerable: No microcode Processor vulnerable and microcode is missing
+ mitigation.
+ Mitigation: AVX disabled,
+ no microcode Processor is vulnerable and microcode is missing
+ mitigation. AVX disabled as mitigation.
+ Mitigation: Microcode Processor is vulnerable and mitigation is in
+ effect.
+ Mitigation: Microcode (locked) Processor is vulnerable and mitigation is in
+ effect and cannot be disabled.
+ Unknown: Dependent on
+ hypervisor status Running on a virtual guest processor that is
+ affected but with no way to know if host
+ processor is mitigated or vulnerable.
+ ============================== =============================================
+
+GDS Default mitigation
+----------------------
+The updated microcode will enable the mitigation by default. The kernel's
+default action is to leave the mitigation enabled.
diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst
index ca4dbdd9016d..245468b0f2be 100644
--- a/Documentation/admin-guide/hw-vuln/index.rst
+++ b/Documentation/admin-guide/hw-vuln/index.rst
@@ -15,3 +15,5 @@ are configurable at compile, boot or run time.
tsx_async_abort
multihit.rst
special-register-buffer-data-sampling.rst
+ processor_mmio_stale_data.rst
+ gather_data_sampling.rst
diff --git a/Documentation/admin-guide/hw-vuln/processor_mmio_stale_data.rst b/Documentation/admin-guide/hw-vuln/processor_mmio_stale_data.rst
new file mode 100644
index 000000000000..c98fd11907cc
--- /dev/null
+++ b/Documentation/admin-guide/hw-vuln/processor_mmio_stale_data.rst
@@ -0,0 +1,260 @@
+=========================================
+Processor MMIO Stale Data Vulnerabilities
+=========================================
+
+Processor MMIO Stale Data Vulnerabilities are a class of memory-mapped I/O
+(MMIO) vulnerabilities that can expose data. The sequences of operations for
+exposing data range from simple to very complex. Because most of the
+vulnerabilities require the attacker to have access to MMIO, many environments
+are not affected. System environments using virtualization where MMIO access is
+provided to untrusted guests may need mitigation. These vulnerabilities are
+not transient execution attacks. However, these vulnerabilities may propagate
+stale data into core fill buffers where the data can subsequently be inferred
+by an unmitigated transient execution attack. Mitigation for these
+vulnerabilities includes a combination of microcode update and software
+changes, depending on the platform and usage model. Some of these mitigations
+are similar to those used to mitigate Microarchitectural Data Sampling (MDS) or
+those used to mitigate Special Register Buffer Data Sampling (SRBDS).
+
+Data Propagators
+================
+Propagators are operations that result in stale data being copied or moved from
+one microarchitectural buffer or register to another. Processor MMIO Stale Data
+Vulnerabilities are operations that may result in stale data being directly
+read into an architectural, software-visible state or sampled from a buffer or
+register.
+
+Fill Buffer Stale Data Propagator (FBSDP)
+-----------------------------------------
+Stale data may propagate from fill buffers (FB) into the non-coherent portion
+of the uncore on some non-coherent writes. Fill buffer propagation by itself
+does not make stale data architecturally visible. Stale data must be propagated
+to a location where it is subject to reading or sampling.
+
+Sideband Stale Data Propagator (SSDP)
+-------------------------------------
+The sideband stale data propagator (SSDP) is limited to the client (including
+Intel Xeon server E3) uncore implementation. The sideband response buffer is
+shared by all client cores. For non-coherent reads that go to sideband
+destinations, the uncore logic returns 64 bytes of data to the core, including
+both requested data and unrequested stale data, from a transaction buffer and
+the sideband response buffer. As a result, stale data from the sideband
+response and transaction buffers may now reside in a core fill buffer.
+
+Primary Stale Data Propagator (PSDP)
+------------------------------------
+The primary stale data propagator (PSDP) is limited to the client (including
+Intel Xeon server E3) uncore implementation. Similar to the sideband response
+buffer, the primary response buffer is shared by all client cores. For some
+processors, MMIO primary reads will return 64 bytes of data to the core fill
+buffer including both requested data and unrequested stale data. This is
+similar to the sideband stale data propagator.
+
+Vulnerabilities
+===============
+Device Register Partial Write (DRPW) (CVE-2022-21166)
+-----------------------------------------------------
+Some endpoint MMIO registers incorrectly handle writes that are smaller than
+the register size. Instead of aborting the write or only copying the correct
+subset of bytes (for example, 2 bytes for a 2-byte write), more bytes than
+specified by the write transaction may be written to the register. On
+processors affected by FBSDP, this may expose stale data from the fill buffers
+of the core that created the write transaction.
+
+Shared Buffers Data Sampling (SBDS) (CVE-2022-21125)
+----------------------------------------------------
+After propagators may have moved data around the uncore and copied stale data
+into client core fill buffers, processors affected by MFBDS can leak data from
+the fill buffer. It is limited to the client (including Intel Xeon server E3)
+uncore implementation.
+
+Shared Buffers Data Read (SBDR) (CVE-2022-21123)
+------------------------------------------------
+It is similar to Shared Buffer Data Sampling (SBDS) except that the data is
+directly read into the architectural software-visible state. It is limited to
+the client (including Intel Xeon server E3) uncore implementation.
+
+Affected Processors
+===================
+Not all the CPUs are affected by all the variants. For instance, most
+processors for the server market (excluding Intel Xeon E3 processors) are
+impacted by only Device Register Partial Write (DRPW).
+
+Below is the list of affected Intel processors [#f1]_:
+
+ =================== ============ =========
+ Common name Family_Model Steppings
+ =================== ============ =========
+ HASWELL_X 06_3FH 2,4
+ SKYLAKE_L 06_4EH 3
+ BROADWELL_X 06_4FH All
+ SKYLAKE_X 06_55H 3,4,6,7,11
+ BROADWELL_D 06_56H 3,4,5
+ SKYLAKE 06_5EH 3
+ ICELAKE_X 06_6AH 4,5,6
+ ICELAKE_D 06_6CH 1
+ ICELAKE_L 06_7EH 5
+ ATOM_TREMONT_D 06_86H All
+ LAKEFIELD 06_8AH 1
+ KABYLAKE_L 06_8EH 9 to 12
+ ATOM_TREMONT 06_96H 1
+ ATOM_TREMONT_L 06_9CH 0
+ KABYLAKE 06_9EH 9 to 13
+ COMETLAKE 06_A5H 2,3,5
+ COMETLAKE_L 06_A6H 0,1
+ ROCKETLAKE 06_A7H 1
+ =================== ============ =========
+
+If a CPU is in the affected processor list, but not affected by a variant, it
+is indicated by new bits in MSR IA32_ARCH_CAPABILITIES. As described in a later
+section, mitigation largely remains the same for all the variants, i.e. to
+clear the CPU fill buffers via VERW instruction.
+
+New bits in MSRs
+================
+Newer processors and microcode update on existing affected processors added new
+bits to IA32_ARCH_CAPABILITIES MSR. These bits can be used to enumerate
+specific variants of Processor MMIO Stale Data vulnerabilities and mitigation
+capability.
+
+MSR IA32_ARCH_CAPABILITIES
+--------------------------
+Bit 13 - SBDR_SSDP_NO - When set, processor is not affected by either the
+ Shared Buffers Data Read (SBDR) vulnerability or the sideband stale
+ data propagator (SSDP).
+Bit 14 - FBSDP_NO - When set, processor is not affected by the Fill Buffer
+ Stale Data Propagator (FBSDP).
+Bit 15 - PSDP_NO - When set, processor is not affected by Primary Stale Data
+ Propagator (PSDP).
+Bit 17 - FB_CLEAR - When set, VERW instruction will overwrite CPU fill buffer
+ values as part of MD_CLEAR operations. Processors that do not
+ enumerate MDS_NO (meaning they are affected by MDS) but that do
+ enumerate support for both L1D_FLUSH and MD_CLEAR implicitly enumerate
+ FB_CLEAR as part of their MD_CLEAR support.
+Bit 18 - FB_CLEAR_CTRL - Processor supports read and write to MSR
+ IA32_MCU_OPT_CTRL[FB_CLEAR_DIS]. On such processors, the FB_CLEAR_DIS
+ bit can be set to cause the VERW instruction to not perform the
+ FB_CLEAR action. Not all processors that support FB_CLEAR will support
+ FB_CLEAR_CTRL.
+
+MSR IA32_MCU_OPT_CTRL
+---------------------
+Bit 3 - FB_CLEAR_DIS - When set, VERW instruction does not perform the FB_CLEAR
+action. This may be useful to reduce the performance impact of FB_CLEAR in
+cases where system software deems it warranted (for example, when performance
+is more critical, or the untrusted software has no MMIO access). Note that
+FB_CLEAR_DIS has no impact on enumeration (for example, it does not change
+FB_CLEAR or MD_CLEAR enumeration) and it may not be supported on all processors
+that enumerate FB_CLEAR.
+
+Mitigation
+==========
+Like MDS, all variants of Processor MMIO Stale Data vulnerabilities have the
+same mitigation strategy to force the CPU to clear the affected buffers before
+an attacker can extract the secrets.
+
+This is achieved by using the otherwise unused and obsolete VERW instruction in
+combination with a microcode update. The microcode clears the affected CPU
+buffers when the VERW instruction is executed.
+
+Kernel reuses the MDS function to invoke the buffer clearing:
+
+ mds_clear_cpu_buffers()
+
+On MDS affected CPUs, the kernel already invokes CPU buffer clear on
+kernel/userspace, hypervisor/guest and C-state (idle) transitions. No
+additional mitigation is needed on such CPUs.
+
+For CPUs not affected by MDS or TAA, mitigation is needed only for the attacker
+with MMIO capability. Therefore, VERW is not required for kernel/userspace. For
+virtualization case, VERW is only needed at VMENTER for a guest with MMIO
+capability.
+
+Mitigation points
+-----------------
+Return to user space
+^^^^^^^^^^^^^^^^^^^^
+Same mitigation as MDS when affected by MDS/TAA, otherwise no mitigation
+needed.
+
+C-State transition
+^^^^^^^^^^^^^^^^^^
+Control register writes by CPU during C-state transition can propagate data
+from fill buffer to uncore buffers. Execute VERW before C-state transition to
+clear CPU fill buffers.
+
+Guest entry point
+^^^^^^^^^^^^^^^^^
+Same mitigation as MDS when processor is also affected by MDS/TAA, otherwise
+execute VERW at VMENTER only for MMIO capable guests. On CPUs not affected by
+MDS/TAA, guest without MMIO access cannot extract secrets using Processor MMIO
+Stale Data vulnerabilities, so there is no need to execute VERW for such guests.
+
+Mitigation control on the kernel command line
+---------------------------------------------
+The kernel command line allows to control the Processor MMIO Stale Data
+mitigations at boot time with the option "mmio_stale_data=". The valid
+arguments for this option are:
+
+ ========== =================================================================
+ full If the CPU is vulnerable, enable mitigation; CPU buffer clearing
+ on exit to userspace and when entering a VM. Idle transitions are
+ protected as well. It does not automatically disable SMT.
+ full,nosmt Same as full, with SMT disabled on vulnerable CPUs. This is the
+ complete mitigation.
+ off Disables mitigation completely.
+ ========== =================================================================
+
+If the CPU is affected and mmio_stale_data=off is not supplied on the kernel
+command line, then the kernel selects the appropriate mitigation.
+
+Mitigation status information
+-----------------------------
+The Linux kernel provides a sysfs interface to enumerate the current
+vulnerability status of the system: whether the system is vulnerable, and
+which mitigations are active. The relevant sysfs file is:
+
+ /sys/devices/system/cpu/vulnerabilities/mmio_stale_data
+
+The possible values in this file are:
+
+ .. list-table::
+
+ * - 'Not affected'
+ - The processor is not vulnerable
+ * - 'Vulnerable'
+ - The processor is vulnerable, but no mitigation enabled
+ * - 'Vulnerable: Clear CPU buffers attempted, no microcode'
+ - The processor is vulnerable, but microcode is not updated. The
+ mitigation is enabled on a best effort basis.
+ * - 'Mitigation: Clear CPU buffers'
+ - The processor is vulnerable and the CPU buffer clearing mitigation is
+ enabled.
+ * - 'Unknown: No mitigations'
+ - The processor vulnerability status is unknown because it is
+ out of Servicing period. Mitigation is not attempted.
+
+Definitions:
+------------
+
+Servicing period: The process of providing functional and security updates to
+Intel processors or platforms, utilizing the Intel Platform Update (IPU)
+process or other similar mechanisms.
+
+End of Servicing Updates (ESU): ESU is the date at which Intel will no
+longer provide Servicing, such as through IPU or other similar update
+processes. ESU dates will typically be aligned to end of quarter.
+
+If the processor is vulnerable then the following information is appended to
+the above information:
+
+ ======================== ===========================================
+ 'SMT vulnerable' SMT is enabled
+ 'SMT disabled' SMT is disabled
+ 'SMT Host state unknown' Kernel runs in a VM, Host SMT state unknown
+ ======================== ===========================================
+
+References
+----------
+.. [#f1] Affected Processors
+ https://www.intel.com/content/www/us/en/developer/topic-technology/software-security-guidance/processors-affected-consolidated-product-cpu-model.html
diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst
index 6bd97cd50d62..305600351209 100644
--- a/Documentation/admin-guide/hw-vuln/spectre.rst
+++ b/Documentation/admin-guide/hw-vuln/spectre.rst
@@ -422,6 +422,14 @@ The possible values in this file are:
'RSB filling' Protection of RSB on context switch enabled
============= ===========================================
+ - EIBRS Post-barrier Return Stack Buffer (PBRSB) protection status:
+
+ =========================== =======================================================
+ 'PBRSB-eIBRS: SW sequence' CPU is affected and protection of RSB on VMEXIT enabled
+ 'PBRSB-eIBRS: Vulnerable' CPU is vulnerable
+ 'PBRSB-eIBRS: Not affected' CPU is not affected by PBRSB
+ =========================== =======================================================
+
Full mitigation might require a microcode update from the CPU
vendor. When the necessary microcode is not available, the kernel will
report vulnerability.
@@ -471,8 +479,19 @@ Spectre variant 2
On Intel Skylake-era systems the mitigation covers most, but not all,
cases. See :ref:`[3] <spec_ref3>` for more details.
- On CPUs with hardware mitigation for Spectre variant 2 (e.g. Enhanced
- IBRS on x86), retpoline is automatically disabled at run time.
+ On CPUs with hardware mitigation for Spectre variant 2 (e.g. IBRS
+ or enhanced IBRS on x86), retpoline is automatically disabled at run time.
+
+ Systems which support enhanced IBRS (eIBRS) enable IBRS protection once at
+ boot, by setting the IBRS bit, and they're automatically protected against
+ Spectre v2 variant attacks.
+
+ On Intel's enhanced IBRS systems, this includes cross-thread branch target
+ injections on SMT systems (STIBP). In other words, Intel eIBRS enables
+ STIBP, too.
+
+ AMD Automatic IBRS does not protect userspace, and Legacy IBRS systems clear
+ the IBRS bit on exit to userspace, therefore both explicitly enable STIBP.
The retpoline mitigation is turned on by default on vulnerable
CPUs. It can be forced on or off by the administrator
@@ -496,9 +515,12 @@ Spectre variant 2
For Spectre variant 2 mitigation, individual user programs
can be compiled with return trampolines for indirect branches.
This protects them from consuming poisoned entries in the branch
- target buffer left by malicious software. Alternatively, the
- programs can disable their indirect branch speculation via prctl()
- (See :ref:`Documentation/userspace-api/spec_ctrl.rst <set_spec_ctrl>`).
+ target buffer left by malicious software.
+
+ On legacy IBRS systems, at return to userspace, implicit STIBP is disabled
+ because the kernel clears the IBRS bit. In this case, the userspace programs
+ can disable indirect branch speculation via prctl() (See
+ :ref:`Documentation/userspace-api/spec_ctrl.rst <set_spec_ctrl>`).
On x86, this will turn on STIBP to guard against attacks from the
sibling thread when the user program is running, and use IBPB to
flush the branch target buffer when switching to/from the program.
@@ -603,9 +625,10 @@ kernel command line.
retpoline,generic Retpolines
retpoline,lfence LFENCE; indirect branch
retpoline,amd alias for retpoline,lfence
- eibrs enhanced IBRS
- eibrs,retpoline enhanced IBRS + Retpolines
- eibrs,lfence enhanced IBRS + LFENCE
+ eibrs Enhanced/Auto IBRS
+ eibrs,retpoline Enhanced/Auto IBRS + Retpolines
+ eibrs,lfence Enhanced/Auto IBRS + LFENCE
+ ibrs use IBRS to protect kernel
Not specifying this option is equivalent to
spectre_v2=auto.
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 104332b8ed0d..b6785a13eeec 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -825,10 +825,6 @@
debugpat [X86] Enable PAT debugging
- decnet.addr= [HW,NET]
- Format: <area>[,<node>]
- See also Documentation/networking/decnet.txt.
-
default_hugepagesz=
[same as hugepagesz=] The size of the default
HugeTLB page size. This is the size represented by
@@ -1340,6 +1336,26 @@
Format: off | on
default: on
+ gather_data_sampling=
+ [X86,INTEL] Control the Gather Data Sampling (GDS)
+ mitigation.
+
+ Gather Data Sampling is a hardware vulnerability which
+ allows unprivileged speculative access to data which was
+ previously stored in vector registers.
+
+ This issue is mitigated by default in updated microcode.
+ The mitigation may have a performance impact but can be
+ disabled. On systems without the microcode mitigation
+ disabling AVX serves as a mitigation.
+
+ force: Disable AVX to mitigate systems without
+ microcode mitigation. No effect if the microcode
+ mitigation is present. Known to cause crashes in
+ userspace with buggy AVX enumeration.
+
+ off: Disable GDS mitigation.
+
gcov_persist= [GCOV] When non-zero (default), profiling data for
kernel modules is saved and remains accessible via
debugfs, even when the module is unloaded/reloaded.
@@ -1944,24 +1960,57 @@
ivrs_ioapic [HW,X86_64]
Provide an override to the IOAPIC-ID<->DEVICE-ID
- mapping provided in the IVRS ACPI table. For
- example, to map IOAPIC-ID decimal 10 to
- PCI device 00:14.0 write the parameter as:
+ mapping provided in the IVRS ACPI table.
+ By default, PCI segment is 0, and can be omitted.
+
+ For example, to map IOAPIC-ID decimal 10 to
+ PCI segment 0x1 and PCI device 00:14.0,
+ write the parameter as:
+ ivrs_ioapic=10@0001:00:14.0
+
+ Deprecated formats:
+ * To map IOAPIC-ID decimal 10 to PCI device 00:14.0
+ write the parameter as:
ivrs_ioapic[10]=00:14.0
+ * To map IOAPIC-ID decimal 10 to PCI segment 0x1 and
+ PCI device 00:14.0 write the parameter as:
+ ivrs_ioapic[10]=0001:00:14.0
ivrs_hpet [HW,X86_64]
Provide an override to the HPET-ID<->DEVICE-ID
- mapping provided in the IVRS ACPI table. For
- example, to map HPET-ID decimal 0 to
- PCI device 00:14.0 write the parameter as:
+ mapping provided in the IVRS ACPI table.
+ By default, PCI segment is 0, and can be omitted.
+
+ For example, to map HPET-ID decimal 10 to
+ PCI segment 0x1 and PCI device 00:14.0,
+ write the parameter as:
+ ivrs_hpet=10@0001:00:14.0
+
+ Deprecated formats:
+ * To map HPET-ID decimal 0 to PCI device 00:14.0
+ write the parameter as:
ivrs_hpet[0]=00:14.0
+ * To map HPET-ID decimal 10 to PCI segment 0x1 and
+ PCI device 00:14.0 write the parameter as:
+ ivrs_ioapic[10]=0001:00:14.0
ivrs_acpihid [HW,X86_64]
Provide an override to the ACPI-HID:UID<->DEVICE-ID
- mapping provided in the IVRS ACPI table. For
- example, to map UART-HID:UID AMD0020:0 to
- PCI device 00:14.5 write the parameter as:
+ mapping provided in the IVRS ACPI table.
+ By default, PCI segment is 0, and can be omitted.
+
+ For example, to map UART-HID:UID AMD0020:0 to
+ PCI segment 0x1 and PCI device ID 00:14.5,
+ write the parameter as:
+ ivrs_acpihid=AMD0020:0@0001:00:14.5
+
+ Deprecated formats:
+ * To map UART-HID:UID AMD0020:0 to PCI segment is 0,
+ PCI device ID 00:14.5, write the parameter as:
ivrs_acpihid[00:14.5]=AMD0020:0
+ * To map UART-HID:UID AMD0020:0 to PCI segment 0x1 and
+ PCI device ID 00:14.5, write the parameter as:
+ ivrs_acpihid[0001:00:14.5]=AMD0020:0
js= [HW,JOY] Analog joystick
See Documentation/input/joydev/joystick.rst.
@@ -2667,20 +2716,22 @@
Disable all optional CPU mitigations. This
improves system performance, but it may also
expose users to several CPU vulnerabilities.
- Equivalent to: nopti [X86,PPC]
+ Equivalent to: gather_data_sampling=off [X86]
kpti=0 [ARM64]
- nospectre_v1 [X86,PPC]
+ kvm.nx_huge_pages=off [X86]
+ l1tf=off [X86]
+ mds=off [X86]
+ mmio_stale_data=off [X86]
+ no_entry_flush [PPC]
+ no_uaccess_flush [PPC]
nobp=0 [S390]
+ nopti [X86,PPC]
+ nospectre_v1 [X86,PPC]
nospectre_v2 [X86,PPC,S390,ARM64]
- spectre_v2_user=off [X86]
spec_store_bypass_disable=off [X86,PPC]
+ spectre_v2_user=off [X86]
ssbd=force-off [ARM64]
- l1tf=off [X86]
- mds=off [X86]
tsx_async_abort=off [X86]
- kvm.nx_huge_pages=off [X86]
- no_entry_flush [PPC]
- no_uaccess_flush [PPC]
Exceptions:
This does not have any effect on
@@ -2702,6 +2753,7 @@
Equivalent to: l1tf=flush,nosmt [X86]
mds=full,nosmt [X86]
tsx_async_abort=full,nosmt [X86]
+ mmio_stale_data=full,nosmt [X86]
mminit_loglevel=
[KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this
@@ -2711,6 +2763,40 @@
log everything. Information is printed at KERN_DEBUG
so loglevel=8 may also need to be specified.
+ mmio_stale_data=
+ [X86,INTEL] Control mitigation for the Processor
+ MMIO Stale Data vulnerabilities.
+
+ Processor MMIO Stale Data is a class of
+ vulnerabilities that may expose data after an MMIO
+ operation. Exposed data could originate or end in
+ the same CPU buffers as affected by MDS and TAA.
+ Therefore, similar to MDS and TAA, the mitigation
+ is to clear the affected CPU buffers.
+
+ This parameter controls the mitigation. The
+ options are:
+
+ full - Enable mitigation on vulnerable CPUs
+
+ full,nosmt - Enable mitigation and disable SMT on
+ vulnerable CPUs.
+
+ off - Unconditionally disable mitigation
+
+ On MDS or TAA affected machines,
+ mmio_stale_data=off can be prevented by an active
+ MDS or TAA mitigation as these vulnerabilities are
+ mitigated with the same mechanism so in order to
+ disable this mitigation, you need to specify
+ mds=off and tsx_async_abort=off too.
+
+ Not specifying this option is equivalent to
+ mmio_stale_data=full.
+
+ For details see:
+ Documentation/admin-guide/hw-vuln/processor_mmio_stale_data.rst
+
module.sig_enforce
[KNL] When CONFIG_MODULE_SIG is set, this means that
modules without (valid) signatures will fail to load.
@@ -3806,6 +3892,12 @@
fully seed the kernel's CRNG. Default is controlled
by CONFIG_RANDOM_TRUST_CPU.
+ random.trust_bootloader={on,off}
+ [KNL] Enable or disable trusting the use of a
+ seed passed by the bootloader (if available) to
+ fully seed the kernel's CRNG. Default is controlled
+ by CONFIG_RANDOM_TRUST_BOOTLOADER.
+
ras=option[,option,...] [KNL] RAS-specific options
cec_disable [X86]
@@ -4256,6 +4348,18 @@
retain_initrd [RAM] Keep initrd memory after extraction
+ retbleed= [X86] Control mitigation of RETBleed (Arbitrary
+ Speculative Code Execution with Return Instructions)
+ vulnerability.
+
+ off - unconditionally disable
+ auto - automatically select a migitation
+
+ Selecting 'auto' will choose a mitigation method at run
+ time according to the CPU.
+
+ Not specifying this option is equivalent to retbleed=auto.
+
rfkill.default_state=
0 "airplane mode". All wifi, bluetooth, wimax, gps, fm,
etc. communication is blocked by default.
@@ -4496,9 +4600,10 @@
retpoline,generic - Retpolines
retpoline,lfence - LFENCE; indirect branch
retpoline,amd - alias for retpoline,lfence
- eibrs - enhanced IBRS
- eibrs,retpoline - enhanced IBRS + Retpolines
- eibrs,lfence - enhanced IBRS + LFENCE
+ eibrs - Enhanced/Auto IBRS
+ eibrs,retpoline - Enhanced/Auto IBRS + Retpolines
+ eibrs,lfence - Enhanced/Auto IBRS + LFENCE
+ ibrs - use IBRS to protect kernel
Not specifying this option is equivalent to
spectre_v2=auto.
diff --git a/Documentation/admin-guide/pm/cpuidle.rst b/Documentation/admin-guide/pm/cpuidle.rst
index e70b365dbc60..80cf2ef2a506 100644
--- a/Documentation/admin-guide/pm/cpuidle.rst
+++ b/Documentation/admin-guide/pm/cpuidle.rst
@@ -676,8 +676,8 @@ the ``menu`` governor to be used on the systems that use the ``ladder`` governor
by default this way, for example.
The other kernel command line parameters controlling CPU idle time management
-described below are only relevant for the *x86* architecture and some of
-them affect Intel processors only.
+described below are only relevant for the *x86* architecture and references
+to ``intel_idle`` affect Intel processors only.
The *x86* architecture support code recognizes three kernel command line
options related to CPU idle time management: ``idle=poll``, ``idle=halt``,
@@ -699,10 +699,13 @@ idle, so it very well may hurt single-thread computations performance as well as
energy-efficiency. Thus using it for performance reasons may not be a good idea
at all.]
-The ``idle=nomwait`` option disables the ``intel_idle`` driver and causes
-``acpi_idle`` to be used (as long as all of the information needed by it is
-there in the system's ACPI tables), but it is not allowed to use the
-``MWAIT`` instruction of the CPUs to ask the hardware to enter idle states.
+The ``idle=nomwait`` option prevents the use of ``MWAIT`` instruction of
+the CPU to enter idle states. When this option is used, the ``acpi_idle``
+driver will use the ``HLT`` instruction instead of ``MWAIT``. On systems
+running Intel processors, this option disables the ``intel_idle`` driver
+and forces the use of the ``acpi_idle`` driver instead. Note that in either
+case, ``acpi_idle`` driver will function only if all the information needed
+by it is in the system's ACPI tables.
In addition to the architecture-level kernel command line options affecting CPU
idle time management, there are parameters affecting individual ``CPUIdle``
diff --git a/Documentation/admin-guide/security-bugs.rst b/Documentation/admin-guide/security-bugs.rst
index dcd6c93c7aac..521acda367ae 100644
--- a/Documentation/admin-guide/security-bugs.rst
+++ b/Documentation/admin-guide/security-bugs.rst
@@ -56,31 +56,28 @@ information submitted to the security list and any followup discussions
of the report are treated confidentially even after the embargo has been
lifted, in perpetuity.
-Coordination
-------------
-
-Fixes for sensitive bugs, such as those that might lead to privilege
-escalations, may need to be coordinated with the private
-<linux-distros@vs.openwall.org> mailing list so that distribution vendors
-are well prepared to issue a fixed kernel upon public disclosure of the
-upstream fix. Distros will need some time to test the proposed patch and
-will generally request at least a few days of embargo, and vendor update
-publication prefers to happen Tuesday through Thursday. When appropriate,
-the security team can assist with this coordination, or the reporter can
-include linux-distros from the start. In this case, remember to prefix
-the email Subject line with "[vs]" as described in the linux-distros wiki:
-<http://oss-security.openwall.org/wiki/mailing-lists/distros#how-to-use-the-lists>
+Coordination with other groups
+------------------------------
+
+The kernel security team strongly recommends that reporters of potential
+security issues NEVER contact the "linux-distros" mailing list until
+AFTER discussing it with the kernel security team. Do not Cc: both
+lists at once. You may contact the linux-distros mailing list after a
+fix has been agreed on and you fully understand the requirements that
+doing so will impose on you and the kernel community.
+
+The different lists have different goals and the linux-distros rules do
+not contribute to actually fixing any potential security problems.
CVE assignment
--------------
-The security team does not normally assign CVEs, nor do we require them
-for reports or fixes, as this can needlessly complicate the process and
-may delay the bug handling. If a reporter wishes to have a CVE identifier
-assigned ahead of public disclosure, they will need to contact the private
-linux-distros list, described above. When such a CVE identifier is known
-before a patch is provided, it is desirable to mention it in the commit
-message if the reporter agrees.
+The security team does not assign CVEs, nor do we require them for
+reports or fixes, as this can needlessly complicate the process and may
+delay the bug handling. If a reporter wishes to have a CVE identifier
+assigned, they should find one by themselves, for example by contacting
+MITRE directly. However under no circumstances will a patch inclusion
+be delayed to wait for a CVE identifier to arrive.
Non-disclosure agreements
-------------------------
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 7b477fa19534..568c24ff00a7 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -557,6 +557,15 @@ numa_balancing_scan_size_mb is how many megabytes worth of pages are
scanned for a given scan.
+oops_limit
+==========
+
+Number of kernel oopses after which the kernel should panic when
+``panic_on_oops`` is not set. Setting this to 0 disables checking
+the count. Setting this to 1 has the same effect as setting
+``panic_on_oops=1``. The default value is 10000.
+
+
osrelease, ostype & version:
============================
@@ -862,9 +871,40 @@ The kernel command line parameter printk.devkmsg= overrides this and is
a one-time setting until next reboot: once set, it cannot be changed by
this sysctl interface anymore.
+pty
+===
-randomize_va_space:
-===================
+See Documentation/filesystems/devpts.rst.
+
+
+random
+======
+
+This is a directory, with the following entries:
+
+* ``boot_id``: a UUID generated the first time this is retrieved, and
+ unvarying after that;
+
+* ``uuid``: a UUID generated every time this is retrieved (this can
+ thus be used to generate UUIDs at will);
+
+* ``entropy_avail``: the pool's entropy count, in bits;
+
+* ``poolsize``: the entropy pool size, in bits;
+
+* ``urandom_min_reseed_secs``: obsolete (used to determine the minimum
+ number of seconds between urandom pool reseeding). This file is
+ writable for compatibility purposes, but writing to it has no effect
+ on any RNG behavior;
+
+* ``write_wakeup_threshold``: when the entropy count drops below this
+ (as a number of bits), processes waiting to write to ``/dev/random``
+ are woken up. This file is writable for compatibility purposes, but
+ writing to it has no effect on any RNG behavior.
+
+
+randomize_va_space
+==================
This option can be used to select the type of process address
space randomization that is used in the system, for architectures
@@ -1146,6 +1186,16 @@ entry will default to 2 instead of 0.
2 Unprivileged calls to ``bpf()`` are disabled
= =============================================================
+
+warn_limit
+==========
+
+Number of kernel warnings after which the kernel should panic when
+``panic_on_warn`` is not set. Setting this to 0 disables checking
+the warning count. Setting this to 1 has the same effect as setting
+``panic_on_warn=1``. The default value is 0.
+
+
watchdog:
=========
diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst
index 287b98708a40..70bab788fca3 100644
--- a/Documentation/admin-guide/sysctl/net.rst
+++ b/Documentation/admin-guide/sysctl/net.rst
@@ -31,17 +31,18 @@ see only some of them, depending on your kernel's configuration.
Table : Subdirectories in /proc/sys/net
- ========= =================== = ========== ==================
+ ========= =================== = ========== ===================
Directory Content Directory Content
- ========= =================== = ========== ==================
- core General parameter appletalk Appletalk protocol
- unix Unix domain sockets netrom NET/ROM
- 802 E802 protocol ax25 AX25
- ethernet Ethernet protocol rose X.25 PLP layer
+ ========= =================== = ========== ===================
+ 802 E802 protocol mptcp Multipath TCP
+ appletalk Appletalk protocol netfilter Network Filter
+ ax25 AX25 netrom NET/ROM
+ bridge Bridging rose X.25 PLP layer
+ core General parameter tipc TIPC
+ ethernet Ethernet protocol unix Unix domain sockets
ipv4 IP version 4 x25 X.25 protocol
- bridge Bridging decnet DEC net
- ipv6 IP version 6 tipc TIPC
- ========= =================== = ========== ==================
+ ipv6 IP version 6
+ ========= =================== = ========== ===================
1. /proc/sys/net/core - Network core options
============================================
diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index 64aeee1009ca..fdc9c99437d1 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -61,6 +61,7 @@ Currently, these files are in /proc/sys/vm:
- overcommit_memory
- overcommit_ratio
- page-cluster
+- page_lock_unfairness
- panic_on_oom
- percpu_pagelist_fraction
- stat_interval
@@ -741,6 +742,14 @@ extra faults and I/O delays for following faults if they would have been part of
that consecutive pages readahead would have brought in.
+page_lock_unfairness
+====================
+
+This value determines the number of times that the page lock can be
+stolen from under a waiter. After the lock is stolen the number of times
+specified in this file (default is 5), the "fair lock handoff" semantics
+will apply, and the waiter will only be awakened if the lock can be taken.
+
panic_on_oom
============
diff --git a/Documentation/arm64/silicon-errata.rst b/Documentation/arm64/silicon-errata.rst
index 59daa4c21816..6b70b6aabcff 100644
--- a/Documentation/arm64/silicon-errata.rst
+++ b/Documentation/arm64/silicon-errata.rst
@@ -70,8 +70,12 @@ stable kernels.
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A57 | #834220 | ARM64_ERRATUM_834220 |
+----------------+-----------------+-----------------+-----------------------------+
+| ARM | Cortex-A57 | #1742098 | ARM64_ERRATUM_1742098 |
++----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A72 | #853709 | N/A |
+----------------+-----------------+-----------------+-----------------------------+
+| ARM | Cortex-A72 | #1655431 | ARM64_ERRATUM_1742098 |
++----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A73 | #858921 | ARM64_ERRATUM_858921 |
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A55 | #1024718 | ARM64_ERRATUM_1024718 |
@@ -130,6 +134,9 @@ stable kernels.
+----------------+-----------------+-----------------+-----------------------------+
| Hisilicon | Hip08 SMMU PMCG | #162001800 | N/A |
+----------------+-----------------+-----------------+-----------------------------+
+| Hisilicon | Hip08 SMMU PMCG | #162001900 | N/A |
+| | Hip09 SMMU PMCG | | |
++----------------+-----------------+-----------------+-----------------------------+
+----------------+-----------------+-----------------+-----------------------------+
| Qualcomm Tech. | Kryo/Falkor v1 | E1003 | QCOM_FALKOR_ERRATUM_1003 |
+----------------+-----------------+-----------------+-----------------------------+
diff --git a/Documentation/atomic_bitops.txt b/Documentation/atomic_bitops.txt
index 093cdaefdb37..d8b101c97031 100644
--- a/Documentation/atomic_bitops.txt
+++ b/Documentation/atomic_bitops.txt
@@ -59,7 +59,7 @@ Like with atomic_t, the rule of thumb is:
- RMW operations that have a return value are fully ordered.
- RMW operations that are conditional are unordered on FAILURE,
- otherwise the above rules apply. In the case of test_and_{}_bit() operations,
+ otherwise the above rules apply. In the case of test_and_set_bit_lock(),
if the bit in memory is unchanged by the operation then it is deemed to have
failed.
diff --git a/Documentation/conf.py b/Documentation/conf.py
index a8fe845832bc..38c1f7618b5e 100644
--- a/Documentation/conf.py
+++ b/Documentation/conf.py
@@ -98,7 +98,7 @@ finally:
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
-language = None
+language = 'en'
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
diff --git a/Documentation/dev-tools/gdb-kernel-debugging.rst b/Documentation/dev-tools/gdb-kernel-debugging.rst
index 19df79286f00..afe4bc206486 100644
--- a/Documentation/dev-tools/gdb-kernel-debugging.rst
+++ b/Documentation/dev-tools/gdb-kernel-debugging.rst
@@ -39,6 +39,10 @@ Setup
this mode. In this case, you should build the kernel with
CONFIG_RANDOMIZE_BASE disabled if the architecture supports KASLR.
+- Build the gdb scripts (required on kernels v5.1 and above)::
+
+ make scripts_gdb
+
- Enable the gdb stub of QEMU/KVM, either
- at VM startup time by appending "-s" to the QEMU command line
diff --git a/Documentation/devicetree/bindings/arm/qcom.yaml b/Documentation/devicetree/bindings/arm/qcom.yaml
index e39d8f02e33c..3de6182cde97 100644
--- a/Documentation/devicetree/bindings/arm/qcom.yaml
+++ b/Documentation/devicetree/bindings/arm/qcom.yaml
@@ -112,8 +112,8 @@ properties:
- const: qcom,msm8974
- items:
- - const: qcom,msm8916-mtp/1
- const: qcom,msm8916-mtp
+ - const: qcom,msm8916-mtp/1
- const: qcom,msm8916
- items:
diff --git a/Documentation/devicetree/bindings/dma/allwinner,sun50i-a64-dma.yaml b/Documentation/devicetree/bindings/dma/allwinner,sun50i-a64-dma.yaml
index 4cb9d6b93138..c61c4a6b57f1 100644
--- a/Documentation/devicetree/bindings/dma/allwinner,sun50i-a64-dma.yaml
+++ b/Documentation/devicetree/bindings/dma/allwinner,sun50i-a64-dma.yaml
@@ -58,7 +58,7 @@ if:
then:
properties:
clocks:
- maxItems: 2
+ minItems: 2
required:
- clock-names
diff --git a/Documentation/devicetree/bindings/dma/moxa,moxart-dma.txt b/Documentation/devicetree/bindings/dma/moxa,moxart-dma.txt
index 8a9f3559335b..7e14e26676ec 100644
--- a/Documentation/devicetree/bindings/dma/moxa,moxart-dma.txt
+++ b/Documentation/devicetree/bindings/dma/moxa,moxart-dma.txt
@@ -34,8 +34,8 @@ Example:
Use specific request line passing from dma
For example, MMC request line is 5
- sdhci: sdhci@98e00000 {
- compatible = "moxa,moxart-sdhci";
+ mmc: mmc@98e00000 {
+ compatible = "moxa,moxart-mmc";
reg = <0x98e00000 0x5C>;
interrupts = <5 0>;
clocks = <&clk_apb>;
diff --git a/Documentation/devicetree/bindings/gpio/gpio-altera.txt b/Documentation/devicetree/bindings/gpio/gpio-altera.txt
index 146e554b3c67..2a80e272cd66 100644
--- a/Documentation/devicetree/bindings/gpio/gpio-altera.txt
+++ b/Documentation/devicetree/bindings/gpio/gpio-altera.txt
@@ -9,8 +9,9 @@ Required properties:
- The second cell is reserved and is currently unused.
- gpio-controller : Marks the device node as a GPIO controller.
- interrupt-controller: Mark the device node as an interrupt controller
-- #interrupt-cells : Should be 1. The interrupt type is fixed in the hardware.
+- #interrupt-cells : Should be 2. The interrupt type is fixed in the hardware.
- The first cell is the GPIO offset number within the GPIO controller.
+ - The second cell is the interrupt trigger type and level flags.
- interrupts: Specify the interrupt.
- altr,interrupt-type: Specifies the interrupt trigger type the GPIO
hardware is synthesized. This field is required if the Altera GPIO controller
@@ -38,6 +39,6 @@ gpio_altr: gpio@ff200000 {
altr,interrupt-type = <IRQ_TYPE_EDGE_RISING>;
#gpio-cells = <2>;
gpio-controller;
- #interrupt-cells = <1>;
+ #interrupt-cells = <2>;
interrupt-controller;
};
diff --git a/Documentation/devicetree/bindings/interrupt-controller/ti,sci-intr.txt b/Documentation/devicetree/bindings/interrupt-controller/ti,sci-intr.txt
index 1a8718f8855d..178fca08278f 100644
--- a/Documentation/devicetree/bindings/interrupt-controller/ti,sci-intr.txt
+++ b/Documentation/devicetree/bindings/interrupt-controller/ti,sci-intr.txt
@@ -55,7 +55,7 @@ Required Properties:
corresponds to a range of host irqs.
For more details on TISCI IRQ resource management refer:
-http://downloads.ti.com/tisci/esd/latest/2_tisci_msgs/rm/rm_irq.html
+https://downloads.ti.com/tisci/esd/latest/2_tisci_msgs/rm/rm_irq.html
Example:
--------
diff --git a/Documentation/devicetree/bindings/phy/amlogic,meson-g12a-usb3-pcie-phy.yaml b/Documentation/devicetree/bindings/phy/amlogic,g12a-usb3-pcie-phy.yaml
index 346f9c35427c..5407468a8dda 100644
--- a/Documentation/devicetree/bindings/phy/amlogic,meson-g12a-usb3-pcie-phy.yaml
+++ b/Documentation/devicetree/bindings/phy/amlogic,g12a-usb3-pcie-phy.yaml
@@ -2,7 +2,7 @@
# Copyright 2019 BayLibre, SAS
%YAML 1.2
---
-$id: "http://devicetree.org/schemas/phy/amlogic,meson-g12a-usb3-pcie-phy.yaml#"
+$id: "http://devicetree.org/schemas/phy/amlogic,g12a-usb3-pcie-phy.yaml#"
$schema: "http://devicetree.org/meta-schemas/core.yaml#"
title: Amlogic G12A USB3 + PCIE Combo PHY
@@ -13,7 +13,7 @@ maintainers:
properties:
compatible:
enum:
- - amlogic,meson-g12a-usb3-pcie-phy
+ - amlogic,g12a-usb3-pcie-phy
reg:
maxItems: 1
@@ -47,7 +47,7 @@ required:
examples:
- |
phy@46000 {
- compatible = "amlogic,meson-g12a-usb3-pcie-phy";
+ compatible = "amlogic,g12a-usb3-pcie-phy";
reg = <0x46000 0x2000>;
clocks = <&ref_clk>;
clock-names = "ref_clk";
diff --git a/Documentation/devicetree/bindings/rtc/allwinner,sun6i-a31-rtc.yaml b/Documentation/devicetree/bindings/rtc/allwinner,sun6i-a31-rtc.yaml
index d7a57ec4a640..140793050df3 100644
--- a/Documentation/devicetree/bindings/rtc/allwinner,sun6i-a31-rtc.yaml
+++ b/Documentation/devicetree/bindings/rtc/allwinner,sun6i-a31-rtc.yaml
@@ -128,7 +128,6 @@ required:
- compatible
- reg
- interrupts
- - clocks
- clock-output-names
additionalProperties: false
diff --git a/Documentation/devicetree/bindings/sound/qcom,wcd9335.txt b/Documentation/devicetree/bindings/sound/qcom,wcd9335.txt
index 5d6ea66a863f..1f75feec3dec 100644
--- a/Documentation/devicetree/bindings/sound/qcom,wcd9335.txt
+++ b/Documentation/devicetree/bindings/sound/qcom,wcd9335.txt
@@ -109,7 +109,7 @@ audio-codec@1{
reg = <1 0>;
interrupts = <&msmgpio 54 IRQ_TYPE_LEVEL_HIGH>;
interrupt-names = "intr2"
- reset-gpios = <&msmgpio 64 0>;
+ reset-gpios = <&msmgpio 64 GPIO_ACTIVE_LOW>;
slim-ifc-dev = <&wc9335_ifd>;
clock-names = "mclk", "native";
clocks = <&rpmcc RPM_SMD_DIV_CLK1>,
diff --git a/Documentation/driver-api/spi.rst b/Documentation/driver-api/spi.rst
index f64cb666498a..f28887045049 100644
--- a/Documentation/driver-api/spi.rst
+++ b/Documentation/driver-api/spi.rst
@@ -25,8 +25,8 @@ hardware, which may be as simple as a set of GPIO pins or as complex as
a pair of FIFOs connected to dual DMA engines on the other side of the
SPI shift register (maximizing throughput). Such drivers bridge between
whatever bus they sit on (often the platform bus) and SPI, and expose
-the SPI side of their device as a :c:type:`struct spi_master
-<spi_master>`. SPI devices are children of that master,
+the SPI side of their device as a :c:type:`struct spi_controller
+<spi_controller>`. SPI devices are children of that master,
represented as a :c:type:`struct spi_device <spi_device>` and
manufactured from :c:type:`struct spi_board_info
<spi_board_info>` descriptors which are usually provided by
diff --git a/Documentation/fault-injection/fault-injection.rst b/Documentation/fault-injection/fault-injection.rst
index f51bb21d20e4..49b577307385 100644
--- a/Documentation/fault-injection/fault-injection.rst
+++ b/Documentation/fault-injection/fault-injection.rst
@@ -74,8 +74,8 @@ configuration of fault-injection capabilities.
- /sys/kernel/debug/fail*/times:
- specifies how many times failures may happen at most.
- A value of -1 means "no limit".
+ specifies how many times failures may happen at most. A value of -1
+ means "no limit".
- /sys/kernel/debug/fail*/space:
@@ -163,11 +163,13 @@ configuration of fault-injection capabilities.
- ERRNO: retval must be -1 to -MAX_ERRNO (-4096).
- ERR_NULL: retval must be 0 or -1 to -MAX_ERRNO (-4096).
-- /sys/kernel/debug/fail_function/<functiuon-name>/retval:
+- /sys/kernel/debug/fail_function/<function-name>/retval:
- specifies the "error" return value to inject to the given
- function for given function. This will be created when
- user specifies new injection entry.
+ specifies the "error" return value to inject to the given function.
+ This will be created when the user specifies a new injection entry.
+ Note that this file only accepts unsigned values. So, if you want to
+ use a negative errno, you better use 'printf' instead of 'echo', e.g.:
+ $ printf %#x -12 > retval
Boot option
^^^^^^^^^^^
@@ -331,7 +333,7 @@ Application Examples
FAILTYPE=fail_function
FAILFUNC=open_ctree
echo $FAILFUNC > /sys/kernel/debug/$FAILTYPE/inject
- echo -12 > /sys/kernel/debug/$FAILTYPE/$FAILFUNC/retval
+ printf %#x -12 > /sys/kernel/debug/$FAILTYPE/$FAILFUNC/retval
echo N > /sys/kernel/debug/$FAILTYPE/task-filter
echo 100 > /sys/kernel/debug/$FAILTYPE/probability
echo 0 > /sys/kernel/debug/$FAILTYPE/interval
diff --git a/Documentation/filesystems/directory-locking.rst b/Documentation/filesystems/directory-locking.rst
index de12016ee419..6a238477f27f 100644
--- a/Documentation/filesystems/directory-locking.rst
+++ b/Documentation/filesystems/directory-locking.rst
@@ -23,13 +23,15 @@ exclusive.
locks victim and calls the method. Locks are exclusive.
4) rename() that is _not_ cross-directory. Locking rules: caller locks
-the parent and finds source and target. In case of exchange (with
-RENAME_EXCHANGE in flags argument) lock both. In any case,
-if the target already exists, lock it. If the source is a non-directory,
-lock it. If we need to lock both, lock them in inode pointer order.
-Then call the method. All locks are exclusive.
-NB: we might get away with locking the the source (and target in exchange
-case) shared.
+the parent and finds source and target. Then we decide which of the
+source and target need to be locked. Source needs to be locked if it's a
+non-directory; target - if it's a non-directory or about to be removed.
+Take the locks that need to be taken, in inode pointer order if need
+to take both (that can happen only when both source and target are
+non-directories - the source because it wouldn't be locked otherwise
+and the target because mixing directory and non-directory is allowed
+only with RENAME_EXCHANGE, and that won't be removing the target).
+After the locks had been taken, call the method. All locks are exclusive.
5) link creation. Locking rules:
@@ -44,19 +46,22 @@ All locks are exclusive.
rules:
* lock the filesystem
- * lock parents in "ancestors first" order.
+ * lock parents in "ancestors first" order. If one is not ancestor of
+ the other, lock the parent of source first.
* find source and target.
* if old parent is equal to or is a descendent of target
fail with -ENOTEMPTY
* if new parent is equal to or is a descendent of source
fail with -ELOOP
- * If it's an exchange, lock both the source and the target.
- * If the target exists, lock it. If the source is a non-directory,
- lock it. If we need to lock both, do so in inode pointer order.
+ * Lock both the source and the target provided they exist. If we
+ need to lock two inodes of different type (dir vs non-dir), we lock
+ the directory first. If we need to lock two inodes of the same type,
+ lock them in inode pointer order.
+ * Lock subdirectories involved (source before target).
+ * Lock non-directories involved, in inode pointer order.
* call the method.
-All ->i_rwsem are taken exclusive. Again, we might get away with locking
-the the source (and target in exchange case) shared.
+All ->i_rwsem are taken exclusive.
The rules above obviously guarantee that all directories that are going to be
read, modified or removed by method will be locked by caller.
@@ -66,8 +71,10 @@ If no directory is its own ancestor, the scheme above is deadlock-free.
Proof:
- First of all, at any moment we have a partial ordering of the
- objects - A < B iff A is an ancestor of B.
+[XXX: will be updated once we are done massaging the lock_rename()]
+ First of all, at any moment we have a linear ordering of the
+ objects - A < B iff (A is an ancestor of B) or (B is not an ancestor
+ of A and ptr(A) < ptr(B)).
That ordering can change. However, the following is true:
diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index fc3a0704553c..b8b2906dc221 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -95,7 +95,7 @@ symlink: exclusive
mkdir: exclusive
unlink: exclusive (both)
rmdir: exclusive (both)(see below)
-rename: exclusive (all) (see below)
+rename: exclusive (both parents, some children) (see below)
readlink: no
get_link: no
setattr: exclusive
@@ -113,6 +113,9 @@ tmpfile: no
Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_rwsem
exclusive on victim.
cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem.
+ ->unlink() and ->rename() have ->i_rwsem exclusive on all non-directories
+ involved.
+ ->rename() has ->i_rwsem exclusive on any subdirectory that changes parent.
See Documentation/filesystems/directory-locking.rst for more detailed discussion
of the locking scheme for directory operations.
diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 26c093969573..48301b6b517c 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -858,3 +858,21 @@ be misspelled d_alloc_anon().
[should've been added in 2016] stale comment in finish_open() nonwithstanding,
failure exits in ->atomic_open() instances should *NOT* fput() the file,
no matter what. Everything is handled by the caller.
+
+---
+
+**mandatory**
+
+If ->rename() update of .. on cross-directory move needs an exclusion with
+directory modifications, do *not* lock the subdirectory in question in your
+->rename() - it's done by the caller now [that item should've been added in
+28eceeda130f "fs: Lock moved directories"].
+
+---
+
+**mandatory**
+
+On same-directory ->rename() the (tautological) update of .. is not protected
+by any locks; just don't do it if the old parent is the same as the new one.
+We really can't lock two subdirectories in same-directory rename - not without
+deadlocks.
diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
index 7d4d09dd5e6d..241e31200643 100644
--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@ -1173,7 +1173,7 @@ defined:
return
-ECHILD and it will be called again in ref-walk mode.
-``_weak_revalidate``
+``d_weak_revalidate``
called when the VFS needs to revalidate a "jumped" dentry. This
is called when a path-walk ends at dentry that was not acquired
by doing a lookup in the parent directory. This includes "/",
diff --git a/Documentation/firmware-guide/acpi/apei/einj.rst b/Documentation/firmware-guide/acpi/apei/einj.rst
index e588bccf5158..344284236a81 100644
--- a/Documentation/firmware-guide/acpi/apei/einj.rst
+++ b/Documentation/firmware-guide/acpi/apei/einj.rst
@@ -168,7 +168,7 @@ An error injection example::
0x00000008 Memory Correctable
0x00000010 Memory Uncorrectable non-fatal
# echo 0x12345000 > param1 # Set memory address for injection
- # echo $((-1 << 12)) > param2 # Mask 0xfffffffffffff000 - anywhere in this page
+ # echo 0xfffffffffffff000 > param2 # Mask - anywhere in this page
# echo 0x8 > error_type # Choose correctable memory error
# echo 1 > error_inject # Inject now
diff --git a/Documentation/input/joydev/joystick.rst b/Documentation/input/joydev/joystick.rst
index 9746fd76cc58..f38c330c028e 100644
--- a/Documentation/input/joydev/joystick.rst
+++ b/Documentation/input/joydev/joystick.rst
@@ -517,6 +517,7 @@ All I-Force devices are supported by the iforce module. This includes:
* AVB Mag Turbo Force
* AVB Top Shot Pegasus
* AVB Top Shot Force Feedback Racing Wheel
+* Boeder Force Feedback Wheel
* Logitech WingMan Force
* Logitech WingMan Force Wheel
* Guillemot Race Leader Force Feedback
diff --git a/Documentation/ioctl/ioctl-number.rst b/Documentation/ioctl/ioctl-number.rst
index bef79cd4c6b4..6246a0d4f323 100644
--- a/Documentation/ioctl/ioctl-number.rst
+++ b/Documentation/ioctl/ioctl-number.rst
@@ -301,7 +301,6 @@ Code Seq# Include File Comments
0x89 00-06 arch/x86/include/asm/sockios.h
0x89 0B-DF linux/sockios.h
0x89 E0-EF linux/sockios.h SIOCPROTOPRIVATE range
-0x89 E0-EF linux/dn.h PROTOPRIVATE range
0x89 F0-FF linux/sockios.h SIOCDEVPRIVATE range
0x8B all linux/wireless.h
0x8C 00-3F WiNRADiO driver
diff --git a/Documentation/media/kapi/v4l2-dev.rst b/Documentation/media/kapi/v4l2-dev.rst
index 4c5a15c53dbf..63c064837c00 100644
--- a/Documentation/media/kapi/v4l2-dev.rst
+++ b/Documentation/media/kapi/v4l2-dev.rst
@@ -185,7 +185,7 @@ This will create the character device for you.
.. code-block:: c
- err = video_register_device(vdev, VFL_TYPE_GRABBER, -1);
+ err = video_register_device(vdev, VFL_TYPE_VIDEO, -1);
if (err) {
video_device_release(vdev); /* or kfree(my_vdev); */
return err;
@@ -201,7 +201,7 @@ types exist:
========================== ==================== ==============================
:c:type:`vfl_devnode_type` Device name Usage
========================== ==================== ==============================
-``VFL_TYPE_GRABBER`` ``/dev/videoX`` for video input/output devices
+``VFL_TYPE_VIDEO`` ``/dev/videoX`` for video input/output devices
``VFL_TYPE_VBI`` ``/dev/vbiX`` for vertical blank data (i.e.
closed captions, teletext)
``VFL_TYPE_RADIO`` ``/dev/radioX`` for radio tuners
diff --git a/Documentation/media/uapi/v4l/subdev-formats.rst b/Documentation/media/uapi/v4l/subdev-formats.rst
index 15e11f27b4c8..b89a2f6c9155 100644
--- a/Documentation/media/uapi/v4l/subdev-formats.rst
+++ b/Documentation/media/uapi/v4l/subdev-formats.rst
@@ -7794,3 +7794,30 @@ formats.
- 0x5001
- Interleaved raw UYVY and JPEG image format with embedded meta-data
used by Samsung S3C73MX camera sensors.
+
+.. _v4l2-mbus-metadata-fmts:
+
+Metadata Formats
+^^^^^^^^^^^^^^^^
+
+This section lists all metadata formats.
+
+The following table lists the existing metadata formats.
+
+.. tabularcolumns:: |p{8.0cm}|p{1.4cm}|p{7.7cm}|
+
+.. flat-table:: Metadata formats
+ :header-rows: 1
+ :stub-columns: 0
+
+ * - Identifier
+ - Code
+ - Comments
+ * .. _MEDIA-BUS-FMT-METADATA-FIXED:
+
+ - MEDIA_BUS_FMT_METADATA_FIXED
+ - 0x7001
+ - This format should be used when the same driver handles
+ both sides of the link and the bus format is a fixed
+ metadata format that is not configurable from userspace.
+ Width and height will be set to 0 for this format.
diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst
index 83f7ae5fc045..09b3943b3b71 100644
--- a/Documentation/networking/af_xdp.rst
+++ b/Documentation/networking/af_xdp.rst
@@ -40,13 +40,13 @@ allocates memory for this UMEM using whatever means it feels is most
appropriate (malloc, mmap, huge pages, etc). This memory area is then
registered with the kernel using the new setsockopt XDP_UMEM_REG. The
UMEM also has two rings: the FILL ring and the COMPLETION ring. The
-fill ring is used by the application to send down addr for the kernel
+FILL ring is used by the application to send down addr for the kernel
to fill in with RX packet data. References to these frames will then
appear in the RX ring once each packet has been received. The
-completion ring, on the other hand, contains frame addr that the
+COMPLETION ring, on the other hand, contains frame addr that the
kernel has transmitted completely and can now be used again by user
space, for either TX or RX. Thus, the frame addrs appearing in the
-completion ring are addrs that were previously transmitted using the
+COMPLETION ring are addrs that were previously transmitted using the
TX ring. In summary, the RX and FILL rings are used for the RX path
and the TX and COMPLETION rings are used for the TX path.
@@ -91,11 +91,16 @@ Concepts
========
In order to use an AF_XDP socket, a number of associated objects need
-to be setup.
+to be setup. These objects and their options are explained in the
+following sections.
-Jonathan Corbet has also written an excellent article on LWN,
-"Accelerating networking with AF_XDP". It can be found at
-https://lwn.net/Articles/750845/.
+For an overview on how AF_XDP works, you can also take a look at the
+Linux Plumbers paper from 2018 on the subject:
+http://vger.kernel.org/lpc_net2018_talks/lpc18_paper_af_xdp_perf-v2.pdf. Do
+NOT consult the paper from 2017 on "AF_PACKET v4", the first attempt
+at AF_XDP. Nearly everything changed since then. Jonathan Corbet has
+also written an excellent article on LWN, "Accelerating networking
+with AF_XDP". It can be found at https://lwn.net/Articles/750845/.
UMEM
----
@@ -113,22 +118,22 @@ the next socket B can do this by setting the XDP_SHARED_UMEM flag in
struct sockaddr_xdp member sxdp_flags, and passing the file descriptor
of A to struct sockaddr_xdp member sxdp_shared_umem_fd.
-The UMEM has two single-producer/single-consumer rings, that are used
+The UMEM has two single-producer/single-consumer rings that are used
to transfer ownership of UMEM frames between the kernel and the
user-space application.
Rings
-----
-There are a four different kind of rings: Fill, Completion, RX and
+There are a four different kind of rings: FILL, COMPLETION, RX and
TX. All rings are single-producer/single-consumer, so the user-space
application need explicit synchronization of multiple
processes/threads are reading/writing to them.
-The UMEM uses two rings: Fill and Completion. Each socket associated
+The UMEM uses two rings: FILL and COMPLETION. Each socket associated
with the UMEM must have an RX queue, TX queue or both. Say, that there
is a setup with four sockets (all doing TX and RX). Then there will be
-one Fill ring, one Completion ring, four TX rings and four RX rings.
+one FILL ring, one COMPLETION ring, four TX rings and four RX rings.
The rings are head(producer)/tail(consumer) based rings. A producer
writes the data ring at the index pointed out by struct xdp_ring
@@ -146,7 +151,7 @@ The size of the rings need to be of size power of two.
UMEM Fill Ring
~~~~~~~~~~~~~~
-The Fill ring is used to transfer ownership of UMEM frames from
+The FILL ring is used to transfer ownership of UMEM frames from
user-space to kernel-space. The UMEM addrs are passed in the ring. As
an example, if the UMEM is 64k and each chunk is 4k, then the UMEM has
16 chunks and can pass addrs between 0 and 64k.
@@ -164,8 +169,8 @@ chunks mode, then the incoming addr will be left untouched.
UMEM Completion Ring
~~~~~~~~~~~~~~~~~~~~
-The Completion Ring is used transfer ownership of UMEM frames from
-kernel-space to user-space. Just like the Fill ring, UMEM indicies are
+The COMPLETION Ring is used transfer ownership of UMEM frames from
+kernel-space to user-space. Just like the FILL ring, UMEM indices are
used.
Frames passed from the kernel to user-space are frames that has been
@@ -181,7 +186,7 @@ The RX ring is the receiving side of a socket. Each entry in the ring
is a struct xdp_desc descriptor. The descriptor contains UMEM offset
(addr) and the length of the data (len).
-If no frames have been passed to kernel via the Fill ring, no
+If no frames have been passed to kernel via the FILL ring, no
descriptors will (or can) appear on the RX ring.
The user application consumes struct xdp_desc descriptors from this
@@ -199,8 +204,24 @@ be relaxed in the future.
The user application produces struct xdp_desc descriptors to this
ring.
+Libbpf
+======
+
+Libbpf is a helper library for eBPF and XDP that makes using these
+technologies a lot simpler. It also contains specific helper functions
+in tools/lib/bpf/xsk.h for facilitating the use of AF_XDP. It
+contains two types of functions: those that can be used to make the
+setup of AF_XDP socket easier and ones that can be used in the data
+plane to access the rings safely and quickly. To see an example on how
+to use this API, please take a look at the sample application in
+samples/bpf/xdpsock_usr.c which uses libbpf for both setup and data
+plane operations.
+
+We recommend that you use this library unless you have become a power
+user. It will make your program a lot simpler.
+
XSKMAP / BPF_MAP_TYPE_XSKMAP
-----------------------------
+============================
On XDP side there is a BPF map type BPF_MAP_TYPE_XSKMAP (XSKMAP) that
is used in conjunction with bpf_redirect_map() to pass the ingress
@@ -216,21 +237,193 @@ queue 17. Only the XDP program executing for eth0 and queue 17 will
successfully pass data to the socket. Please refer to the sample
application (samples/bpf/) in for an example.
+Configuration Flags and Socket Options
+======================================
+
+These are the various configuration flags that can be used to control
+and monitor the behavior of AF_XDP sockets.
+
+XDP_COPY and XDP_ZERO_COPY bind flags
+-------------------------------------
+
+When you bind to a socket, the kernel will first try to use zero-copy
+copy. If zero-copy is not supported, it will fall back on using copy
+mode, i.e. copying all packets out to user space. But if you would
+like to force a certain mode, you can use the following flags. If you
+pass the XDP_COPY flag to the bind call, the kernel will force the
+socket into copy mode. If it cannot use copy mode, the bind call will
+fail with an error. Conversely, the XDP_ZERO_COPY flag will force the
+socket into zero-copy mode or fail.
+
+XDP_SHARED_UMEM bind flag
+-------------------------
+
+This flag enables you to bind multiple sockets to the same UMEM, but
+only if they share the same queue id. In this mode, each socket has
+their own RX and TX rings, but the UMEM (tied to the fist socket
+created) only has a single FILL ring and a single COMPLETION
+ring. To use this mode, create the first socket and bind it in the normal
+way. Create a second socket and create an RX and a TX ring, or at
+least one of them, but no FILL or COMPLETION rings as the ones from
+the first socket will be used. In the bind call, set he
+XDP_SHARED_UMEM option and provide the initial socket's fd in the
+sxdp_shared_umem_fd field. You can attach an arbitrary number of extra
+sockets this way.
+
+What socket will then a packet arrive on? This is decided by the XDP
+program. Put all the sockets in the XSK_MAP and just indicate which
+index in the array you would like to send each packet to. A simple
+round-robin example of distributing packets is shown below:
+
+.. code-block:: c
+
+ #include <linux/bpf.h>
+ #include "bpf_helpers.h"
+
+ #define MAX_SOCKS 16
+
+ struct {
+ __uint(type, BPF_MAP_TYPE_XSKMAP);
+ __uint(max_entries, MAX_SOCKS);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+ } xsks_map SEC(".maps");
+
+ static unsigned int rr;
+
+ SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
+ {
+ rr = (rr + 1) & (MAX_SOCKS - 1);
+
+ return bpf_redirect_map(&xsks_map, rr, 0);
+ }
+
+Note, that since there is only a single set of FILL and COMPLETION
+rings, and they are single producer, single consumer rings, you need
+to make sure that multiple processes or threads do not use these rings
+concurrently. There are no synchronization primitives in the
+libbpf code that protects multiple users at this point in time.
+
+XDP_USE_NEED_WAKEUP bind flag
+-----------------------------
+
+This option adds support for a new flag called need_wakeup that is
+present in the FILL ring and the TX ring, the rings for which user
+space is a producer. When this option is set in the bind call, the
+need_wakeup flag will be set if the kernel needs to be explicitly
+woken up by a syscall to continue processing packets. If the flag is
+zero, no syscall is needed.
+
+If the flag is set on the FILL ring, the application needs to call
+poll() to be able to continue to receive packets on the RX ring. This
+can happen, for example, when the kernel has detected that there are no
+more buffers on the FILL ring and no buffers left on the RX HW ring of
+the NIC. In this case, interrupts are turned off as the NIC cannot
+receive any packets (as there are no buffers to put them in), and the
+need_wakeup flag is set so that user space can put buffers on the
+FILL ring and then call poll() so that the kernel driver can put these
+buffers on the HW ring and start to receive packets.
+
+If the flag is set for the TX ring, it means that the application
+needs to explicitly notify the kernel to send any packets put on the
+TX ring. This can be accomplished either by a poll() call, as in the
+RX path, or by calling sendto().
+
+An example of how to use this flag can be found in
+samples/bpf/xdpsock_user.c. An example with the use of libbpf helpers
+would look like this for the TX path:
+
+.. code-block:: c
+
+ if (xsk_ring_prod__needs_wakeup(&my_tx_ring))
+ sendto(xsk_socket__fd(xsk_handle), NULL, 0, MSG_DONTWAIT, NULL, 0);
+
+I.e., only use the syscall if the flag is set.
+
+We recommend that you always enable this mode as it usually leads to
+better performance especially if you run the application and the
+driver on the same core, but also if you use different cores for the
+application and the kernel driver, as it reduces the number of
+syscalls needed for the TX path.
+
+XDP_{RX|TX|UMEM_FILL|UMEM_COMPLETION}_RING setsockopts
+------------------------------------------------------
+
+These setsockopts sets the number of descriptors that the RX, TX,
+FILL, and COMPLETION rings respectively should have. It is mandatory
+to set the size of at least one of the RX and TX rings. If you set
+both, you will be able to both receive and send traffic from your
+application, but if you only want to do one of them, you can save
+resources by only setting up one of them. Both the FILL ring and the
+COMPLETION ring are mandatory if you have a UMEM tied to your socket,
+which is the normal case. But if the XDP_SHARED_UMEM flag is used, any
+socket after the first one does not have a UMEM and should in that
+case not have any FILL or COMPLETION rings created.
+
+XDP_UMEM_REG setsockopt
+-----------------------
+
+This setsockopt registers a UMEM to a socket. This is the area that
+contain all the buffers that packet can recide in. The call takes a
+pointer to the beginning of this area and the size of it. Moreover, it
+also has parameter called chunk_size that is the size that the UMEM is
+divided into. It can only be 2K or 4K at the moment. If you have an
+UMEM area that is 128K and a chunk size of 2K, this means that you
+will be able to hold a maximum of 128K / 2K = 64 packets in your UMEM
+area and that your largest packet size can be 2K.
+
+There is also an option to set the headroom of each single buffer in
+the UMEM. If you set this to N bytes, it means that the packet will
+start N bytes into the buffer leaving the first N bytes for the
+application to use. The final option is the flags field, but it will
+be dealt with in separate sections for each UMEM flag.
+
+SO_BINDTODEVICE setsockopt
+--------------------------
+
+This is a generic SOL_SOCKET option that can be used to tie AF_XDP
+socket to a particular network interface. It is useful when a socket
+is created by a privileged process and passed to a non-privileged one.
+Once the option is set, kernel will refuse attempts to bind that socket
+to a different interface. Updating the value requires CAP_NET_RAW.
+
+XDP_STATISTICS getsockopt
+-------------------------
+
+Gets drop statistics of a socket that can be useful for debug
+purposes. The supported statistics are shown below:
+
+.. code-block:: c
+
+ struct xdp_statistics {
+ __u64 rx_dropped; /* Dropped for reasons other than invalid desc */
+ __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */
+ __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */
+ };
+
+XDP_OPTIONS getsockopt
+----------------------
+
+Gets options from an XDP socket. The only one supported so far is
+XDP_OPTIONS_ZEROCOPY which tells you if zero-copy is on or not.
+
Usage
=====
-In order to use AF_XDP sockets there are two parts needed. The
+In order to use AF_XDP sockets two parts are needed. The
user-space application and the XDP program. For a complete setup and
usage example, please refer to the sample application. The user-space
side is xdpsock_user.c and the XDP side is part of libbpf.
-The XDP code sample included in tools/lib/bpf/xsk.c is the following::
+The XDP code sample included in tools/lib/bpf/xsk.c is the following:
+
+.. code-block:: c
SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
{
int index = ctx->rx_queue_index;
- // A set entry here means that the correspnding queue_id
+ // A set entry here means that the corresponding queue_id
// has an active AF_XDP socket bound to it.
if (bpf_map_lookup_elem(&xsks_map, &index))
return bpf_redirect_map(&xsks_map, index, 0);
@@ -238,7 +431,10 @@ The XDP code sample included in tools/lib/bpf/xsk.c is the following::
return XDP_PASS;
}
-Naive ring dequeue and enqueue could look like this::
+A simple but not so performance ring dequeue and enqueue could look
+like this:
+
+.. code-block:: c
// struct xdp_rxtx_ring {
// __u32 *producer;
@@ -287,17 +483,16 @@ Naive ring dequeue and enqueue could look like this::
return 0;
}
-
-For a more optimized version, please refer to the sample application.
+But please use the libbpf functions as they are optimized and ready to
+use. Will make your life easier.
Sample application
==================
There is a xdpsock benchmarking/test application included that
-demonstrates how to use AF_XDP sockets with both private and shared
-UMEMs. Say that you would like your UDP traffic from port 4242 to end
-up in queue 16, that we will enable AF_XDP on. Here, we use ethtool
-for this::
+demonstrates how to use AF_XDP sockets with private UMEMs. Say that
+you would like your UDP traffic from port 4242 to end up in queue 16,
+that we will enable AF_XDP on. Here, we use ethtool for this::
ethtool -N p3p2 rx-flow-hash udp4 fn
ethtool -N p3p2 flow-type udp4 src-port 4242 dst-port 4242 \
@@ -311,13 +506,18 @@ using::
For XDP_SKB mode, use the switch "-S" instead of "-N" and all options
can be displayed with "-h", as usual.
+This sample application uses libbpf to make the setup and usage of
+AF_XDP simpler. If you want to know how the raw uapi of AF_XDP is
+really used to make something more advanced, take a look at the libbpf
+code in tools/lib/bpf/xsk.[ch].
+
FAQ
=======
Q: I am not seeing any traffic on the socket. What am I doing wrong?
A: When a netdev of a physical NIC is initialized, Linux usually
- allocates one Rx and Tx queue pair per core. So on a 8 core system,
+ allocates one RX and TX queue pair per core. So on a 8 core system,
queue ids 0 to 7 will be allocated, one per core. In the AF_XDP
bind call or the xsk_socket__create libbpf function call, you
specify a specific queue id to bind to and it is only the traffic
@@ -343,9 +543,21 @@ A: When a netdev of a physical NIC is initialized, Linux usually
sudo ethtool -N <interface> flow-type udp4 src-port 4242 dst-port \
4242 action 2
- A number of other ways are possible all up to the capabilitites of
+ A number of other ways are possible all up to the capabilities of
the NIC you have.
+Q: Can I use the XSKMAP to implement a switch betwen different umems
+ in copy mode?
+
+A: The short answer is no, that is not supported at the moment. The
+ XSKMAP can only be used to switch traffic coming in on queue id X
+ to sockets bound to the same queue id X. The XSKMAP can contain
+ sockets bound to different queue ids, for example X and Y, but only
+ traffic goming in from queue id Y can be directed to sockets bound
+ to the same queue id Y. In zero-copy mode, you should use the
+ switch, or other distribution mechanism, in your NIC to direct
+ traffic to the correct queue id and socket.
+
Credits
=======
diff --git a/Documentation/networking/decnet.txt b/Documentation/networking/decnet.txt
deleted file mode 100644
index d192f8b9948b..000000000000
--- a/Documentation/networking/decnet.txt
+++ /dev/null
@@ -1,230 +0,0 @@
- Linux DECnet Networking Layer Information
- ===========================================
-
-1) Other documentation....
-
- o Project Home Pages
- http://www.chygwyn.com/ - Kernel info
- http://linux-decnet.sourceforge.net/ - Userland tools
- http://www.sourceforge.net/projects/linux-decnet/ - Status page
-
-2) Configuring the kernel
-
-Be sure to turn on the following options:
-
- CONFIG_DECNET (obviously)
- CONFIG_PROC_FS (to see what's going on)
- CONFIG_SYSCTL (for easy configuration)
-
-if you want to try out router support (not properly debugged yet)
-you'll need the following options as well...
-
- CONFIG_DECNET_ROUTER (to be able to add/delete routes)
- CONFIG_NETFILTER (will be required for the DECnet routing daemon)
-
-Don't turn on SIOCGIFCONF support for DECnet unless you are really sure
-that you need it, in general you won't and it can cause ifconfig to
-malfunction.
-
-Run time configuration has changed slightly from the 2.4 system. If you
-want to configure an endnode, then the simplified procedure is as follows:
-
- o Set the MAC address on your ethernet card before starting _any_ other
- network protocols.
-
-As soon as your network card is brought into the UP state, DECnet should
-start working. If you need something more complicated or are unsure how
-to set the MAC address, see the next section. Also all configurations which
-worked with 2.4 will work under 2.5 with no change.
-
-3) Command line options
-
-You can set a DECnet address on the kernel command line for compatibility
-with the 2.4 configuration procedure, but in general it's not needed any more.
-If you do st a DECnet address on the command line, it has only one purpose
-which is that its added to the addresses on the loopback device.
-
-With 2.4 kernels, DECnet would only recognise addresses as local if they
-were added to the loopback device. In 2.5, any local interface address
-can be used to loop back to the local machine. Of course this does not
-prevent you adding further addresses to the loopback device if you
-want to.
-
-N.B. Since the address list of an interface determines the addresses for
-which "hello" messages are sent, if you don't set an address on the loopback
-interface then you won't see any entries in /proc/net/neigh for the local
-host until such time as you start a connection. This doesn't affect the
-operation of the local communications in any other way though.
-
-The kernel command line takes options looking like the following:
-
- decnet.addr=1,2
-
-the two numbers are the node address 1,2 = 1.2 For 2.2.xx kernels
-and early 2.3.xx kernels, you must use a comma when specifying the
-DECnet address like this. For more recent 2.3.xx kernels, you may
-use almost any character except space, although a `.` would be the most
-obvious choice :-)
-
-There used to be a third number specifying the node type. This option
-has gone away in favour of a per interface node type. This is now set
-using /proc/sys/net/decnet/conf/<dev>/forwarding. This file can be
-set with a single digit, 0=EndNode, 1=L1 Router and 2=L2 Router.
-
-There are also equivalent options for modules. The node address can
-also be set through the /proc/sys/net/decnet/ files, as can other system
-parameters.
-
-Currently the only supported devices are ethernet and ip_gre. The
-ethernet address of your ethernet card has to be set according to the DECnet
-address of the node in order for it to be autoconfigured (and then appear in
-/proc/net/decnet_dev). There is a utility available at the above
-FTP sites called dn2ethaddr which can compute the correct ethernet
-address to use. The address can be set by ifconfig either before or
-at the time the device is brought up. If you are using RedHat you can
-add the line:
-
- MACADDR=AA:00:04:00:03:04
-
-or something similar, to /etc/sysconfig/network-scripts/ifcfg-eth0 or
-wherever your network card's configuration lives. Setting the MAC address
-of your ethernet card to an address starting with "hi-ord" will cause a
-DECnet address which matches to be added to the interface (which you can
-verify with iproute2).
-
-The default device for routing can be set through the /proc filesystem
-by setting /proc/sys/net/decnet/default_device to the
-device you want DECnet to route packets out of when no specific route
-is available. Usually this will be eth0, for example:
-
- echo -n "eth0" >/proc/sys/net/decnet/default_device
-
-If you don't set the default device, then it will default to the first
-ethernet card which has been autoconfigured as described above. You can
-confirm that by looking in the default_device file of course.
-
-There is a list of what the other files under /proc/sys/net/decnet/ do
-on the kernel patch web site (shown above).
-
-4) Run time kernel configuration
-
-This is either done through the sysctl/proc interface (see the kernel web
-pages for details on what the various options do) or through the iproute2
-package in the same way as IPv4/6 configuration is performed.
-
-Documentation for iproute2 is included with the package, although there is
-as yet no specific section on DECnet, most of the features apply to both
-IP and DECnet, albeit with DECnet addresses instead of IP addresses and
-a reduced functionality.
-
-If you want to configure a DECnet router you'll need the iproute2 package
-since its the _only_ way to add and delete routes currently. Eventually
-there will be a routing daemon to send and receive routing messages for
-each interface and update the kernel routing tables accordingly. The
-routing daemon will use netfilter to listen to routing packets, and
-rtnetlink to update the kernels routing tables.
-
-The DECnet raw socket layer has been removed since it was there purely
-for use by the routing daemon which will now use netfilter (a much cleaner
-and more generic solution) instead.
-
-5) How can I tell if its working ?
-
-Here is a quick guide of what to look for in order to know if your DECnet
-kernel subsystem is working.
-
- - Is the node address set (see /proc/sys/net/decnet/node_address)
- - Is the node of the correct type
- (see /proc/sys/net/decnet/conf/<dev>/forwarding)
- - Is the Ethernet MAC address of each Ethernet card set to match
- the DECnet address. If in doubt use the dn2ethaddr utility available
- at the ftp archive.
- - If the previous two steps are satisfied, and the Ethernet card is up,
- you should find that it is listed in /proc/net/decnet_dev and also
- that it appears as a directory in /proc/sys/net/decnet/conf/. The
- loopback device (lo) should also appear and is required to communicate
- within a node.
- - If you have any DECnet routers on your network, they should appear
- in /proc/net/decnet_neigh, otherwise this file will only contain the
- entry for the node itself (if it doesn't check to see if lo is up).
- - If you want to send to any node which is not listed in the
- /proc/net/decnet_neigh file, you'll need to set the default device
- to point to an Ethernet card with connection to a router. This is
- again done with the /proc/sys/net/decnet/default_device file.
- - Try starting a simple server and client, like the dnping/dnmirror
- over the loopback interface. With luck they should communicate.
- For this step and those after, you'll need the DECnet library
- which can be obtained from the above ftp sites as well as the
- actual utilities themselves.
- - If this seems to work, then try talking to a node on your local
- network, and see if you can obtain the same results.
- - At this point you are on your own... :-)
-
-6) How to send a bug report
-
-If you've found a bug and want to report it, then there are several things
-you can do to help me work out exactly what it is that is wrong. Useful
-information (_most_ of which _is_ _essential_) includes:
-
- - What kernel version are you running ?
- - What version of the patch are you running ?
- - How far though the above set of tests can you get ?
- - What is in the /proc/decnet* files and /proc/sys/net/decnet/* files ?
- - Which services are you running ?
- - Which client caused the problem ?
- - How much data was being transferred ?
- - Was the network congested ?
- - How can the problem be reproduced ?
- - Can you use tcpdump to get a trace ? (N.B. Most (all?) versions of
- tcpdump don't understand how to dump DECnet properly, so including
- the hex listing of the packet contents is _essential_, usually the -x flag.
- You may also need to increase the length grabbed with the -s flag. The
- -e flag also provides very useful information (ethernet MAC addresses))
-
-7) MAC FAQ
-
-A quick FAQ on ethernet MAC addresses to explain how Linux and DECnet
-interact and how to get the best performance from your hardware.
-
-Ethernet cards are designed to normally only pass received network frames
-to a host computer when they are addressed to it, or to the broadcast address.
-
-Linux has an interface which allows the setting of extra addresses for
-an ethernet card to listen to. If the ethernet card supports it, the
-filtering operation will be done in hardware, if not the extra unwanted packets
-received will be discarded by the host computer. In the latter case,
-significant processor time and bus bandwidth can be used up on a busy
-network (see the NAPI documentation for a longer explanation of these
-effects).
-
-DECnet makes use of this interface to allow running DECnet on an ethernet
-card which has already been configured using TCP/IP (presumably using the
-built in MAC address of the card, as usual) and/or to allow multiple DECnet
-addresses on each physical interface. If you do this, be aware that if your
-ethernet card doesn't support perfect hashing in its MAC address filter
-then your computer will be doing more work than required. Some cards
-will simply set themselves into promiscuous mode in order to receive
-packets from the DECnet specified addresses. So if you have one of these
-cards its better to set the MAC address of the card as described above
-to gain the best efficiency. Better still is to use a card which supports
-NAPI as well.
-
-
-8) Mailing list
-
-If you are keen to get involved in development, or want to ask questions
-about configuration, or even just report bugs, then there is a mailing
-list that you can join, details are at:
-
-http://sourceforge.net/mail/?group_id=4993
-
-9) Legal Info
-
-The Linux DECnet project team have placed their code under the GPL. The
-software is provided "as is" and without warranty express or implied.
-DECnet is a trademark of Compaq. This software is not a product of
-Compaq. We acknowledge the help of people at Compaq in providing extra
-documentation above and beyond what was previously publicly available.
-
-Steve Whitehouse <SteveW@ACM.org>
-
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 8af3771a3ebf..5cf601c94e35 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -876,7 +876,7 @@ cipso_cache_enable - BOOLEAN
cipso_cache_bucket_size - INTEGER
The CIPSO label cache consists of a fixed size hash table with each
hash bucket containing a number of cache entries. This variable limits
- the number of entries in each hash bucket; the larger the value the
+ the number of entries in each hash bucket; the larger the value is, the
more CIPSO label mappings that can be cached. When the number of
entries in a given hash bucket reaches this limit adding new entries
causes the oldest entry in the bucket to be removed to make room.
@@ -953,7 +953,7 @@ ip_nonlocal_bind - BOOLEAN
which can be quite useful - but may break some applications.
Default: 0
-ip_dynaddr - BOOLEAN
+ip_dynaddr - INTEGER
If set non-zero, enables support for dynamic addresses.
If set to a non-zero value larger than 1, a kernel log
message will be printed when dynamic address rewriting
@@ -2284,7 +2284,14 @@ sctp_rmem - vector of 3 INTEGERs: min, default, max
Default: 4K
sctp_wmem - vector of 3 INTEGERs: min, default, max
- Currently this tunable has no effect.
+ Only the first value ("min") is used, "default" and "max" are
+ ignored.
+
+ min: Minimum size of send buffer that can be used by SCTP sockets.
+ It is guaranteed to each SCTP socket (but not association) even
+ under moderate memory pressure.
+
+ Default: 4K
addr_scope_policy - INTEGER
Control IPv4 address scoping - draft-stewart-tsvwg-sctp-ipv4-00
diff --git a/Documentation/power/runtime_pm.rst b/Documentation/power/runtime_pm.rst
index 2c2ec99b5088..78bef529464f 100644
--- a/Documentation/power/runtime_pm.rst
+++ b/Documentation/power/runtime_pm.rst
@@ -382,6 +382,12 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h:
nonzero, increment the counter and return 1; otherwise return 0 without
changing the counter
+ `int pm_runtime_get_if_active(struct device *dev, bool ign_usage_count);`
+ - return -EINVAL if 'power.disable_depth' is nonzero; otherwise, if the
+ runtime PM status is RPM_ACTIVE, and either ign_usage_count is true
+ or the device's usage_count is non-zero, increment the counter and
+ return 1; otherwise return 0 without changing the counter
+
`void pm_runtime_put_noidle(struct device *dev);`
- decrement the device's usage counter
diff --git a/Documentation/process/code-of-conduct-interpretation.rst b/Documentation/process/code-of-conduct-interpretation.rst
index e899f14a4ba2..43da2cc2e3b9 100644
--- a/Documentation/process/code-of-conduct-interpretation.rst
+++ b/Documentation/process/code-of-conduct-interpretation.rst
@@ -51,7 +51,7 @@ the Technical Advisory Board (TAB) or other maintainers if you're
uncertain how to handle situations that come up. It will not be
considered a violation report unless you want it to be. If you are
uncertain about approaching the TAB or any other maintainers, please
-reach out to our conflict mediator, Mishi Choudhary <mishi@linux.com>.
+reach out to our conflict mediator, Joanna Lee <jlee@linuxfoundation.org>.
In the end, "be kind to each other" is really what the end goal is for
everybody. We know everyone is human and we all fail at times, but the
diff --git a/Documentation/process/submitting-patches.rst b/Documentation/process/submitting-patches.rst
index fb56297f70dc..857be0d44e80 100644
--- a/Documentation/process/submitting-patches.rst
+++ b/Documentation/process/submitting-patches.rst
@@ -133,7 +133,7 @@ as you intend it to.
The maintainer will thank you if you write your patch description in a
form which can be easily pulled into Linux's source code management
-system, ``git``, as a "commit log". See :ref:`explicit_in_reply_to`.
+system, ``git``, as a "commit log". See :ref:`the_canonical_patch_format`.
Solve only one problem per patch. If your description starts to get
long, that's a sign that you probably need to split up your patch.
diff --git a/Documentation/sound/hd-audio/models.rst b/Documentation/sound/hd-audio/models.rst
index 4c91abad7b35..71ac38739d27 100644
--- a/Documentation/sound/hd-audio/models.rst
+++ b/Documentation/sound/hd-audio/models.rst
@@ -702,7 +702,7 @@ ref
no-jd
BIOS setup but without jack-detection
intel
- Intel DG45* mobos
+ Intel D*45* mobos
dell-m6-amic
Dell desktops/laptops with analog mics
dell-m6-dmic
diff --git a/Documentation/sound/soc/dapm.rst b/Documentation/sound/soc/dapm.rst
index 8e44107933ab..c3154ce6e1b2 100644
--- a/Documentation/sound/soc/dapm.rst
+++ b/Documentation/sound/soc/dapm.rst
@@ -234,7 +234,7 @@ corresponding soft power control. In this case it is necessary to create
a virtual widget - a widget with no control bits e.g.
::
- SND_SOC_DAPM_MIXER("AC97 Mixer", SND_SOC_DAPM_NOPM, 0, 0, NULL, 0),
+ SND_SOC_DAPM_MIXER("AC97 Mixer", SND_SOC_NOPM, 0, 0, NULL, 0),
This can be used to merge to signal paths together in software.
diff --git a/Documentation/sphinx/load_config.py b/Documentation/sphinx/load_config.py
index eeb394b39e2c..8b416bfd75ac 100644
--- a/Documentation/sphinx/load_config.py
+++ b/Documentation/sphinx/load_config.py
@@ -3,7 +3,7 @@
import os
import sys
-from sphinx.util.pycompat import execfile_
+from sphinx.util.osutil import fs_encoding
# ------------------------------------------------------------------------------
def loadConfig(namespace):
@@ -48,7 +48,9 @@ def loadConfig(namespace):
sys.stdout.write("load additional sphinx-config: %s\n" % config_file)
config = namespace.copy()
config['__file__'] = config_file
- execfile_(config_file, config)
+ with open(config_file, 'rb') as f:
+ code = compile(f.read(), fs_encoding, 'exec')
+ exec(code, config)
del config['__file__']
namespace.update(config)
else:
diff --git a/Documentation/trace/histogram.rst b/Documentation/trace/histogram.rst
index 3f3d1b960fe7..931f3b71745a 100644
--- a/Documentation/trace/histogram.rst
+++ b/Documentation/trace/histogram.rst
@@ -39,7 +39,7 @@ Documentation written by Tom Zanussi
will use the event's kernel stacktrace as the key. The keywords
'keys' or 'key' can be used to specify keys, and the keywords
'values', 'vals', or 'val' can be used to specify values. Compound
- keys consisting of up to two fields can be specified by the 'keys'
+ keys consisting of up to three fields can be specified by the 'keys'
keyword. Hashing a compound key produces a unique entry in the
table for each unique combination of component keys, and can be
useful for providing more fine-grained summaries of event data.
diff --git a/Documentation/translations/zh_CN/video4linux/v4l2-framework.txt b/Documentation/translations/zh_CN/video4linux/v4l2-framework.txt
index 66c7c568bd86..9c39ee58ea50 100644
--- a/Documentation/translations/zh_CN/video4linux/v4l2-framework.txt
+++ b/Documentation/translations/zh_CN/video4linux/v4l2-framework.txt
@@ -649,7 +649,7 @@ video_device注册
接下来你需要注册视频设备:这会为你创建一个字符设备。
- err = video_register_device(vdev, VFL_TYPE_GRABBER, -1);
+ err = video_register_device(vdev, VFL_TYPE_VIDEO, -1);
if (err) {
video_device_release(vdev); /* or kfree(my_vdev); */
return err;
@@ -660,7 +660,7 @@ video_device注册
注册哪种设备是根据类型(type)参数。存在以下类型:
-VFL_TYPE_GRABBER: 用于视频输入/输出设备的 videoX
+VFL_TYPE_VIDEO: 用于视频输入/输出设备的 videoX
VFL_TYPE_VBI: 用于垂直消隐数据的 vbiX (例如,隐藏式字幕,图文电视)
VFL_TYPE_RADIO: 用于广播调谐器的 radioX
diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt
index fd22224853e5..180475bcd671 100644
--- a/Documentation/virt/kvm/api.txt
+++ b/Documentation/virt/kvm/api.txt
@@ -3615,6 +3615,18 @@ Type: vm ioctl
Parameters: struct kvm_s390_cmma_log (in, out)
Returns: 0 on success, a negative value on error
+Errors:
+
+ ====== =============================================================
+ ENOMEM not enough memory can be allocated to complete the task
+ ENXIO if CMMA is not enabled
+ EINVAL if KVM_S390_CMMA_PEEK is not set but migration mode was not enabled
+ EINVAL if KVM_S390_CMMA_PEEK is not set but dirty tracking has been
+ disabled (and thus migration mode was automatically disabled)
+ EFAULT if the userspace address is invalid or if no page table is
+ present for the addresses (e.g. when using hugepages).
+ ====== =============================================================
+
This ioctl is used to get the values of the CMMA bits on the s390
architecture. It is meant to be used in two scenarios:
- During live migration to save the CMMA values. Live migration needs
@@ -3691,12 +3703,6 @@ mask is unused.
values points to the userspace buffer where the result will be stored.
-This ioctl can fail with -ENOMEM if not enough memory can be allocated to
-complete the task, with -ENXIO if CMMA is not enabled, with -EINVAL if
-KVM_S390_CMMA_PEEK is not set but migration mode was not enabled, with
--EFAULT if the userspace address is invalid or if no page table is
-present for the addresses (e.g. when using hugepages).
-
4.108 KVM_S390_SET_CMMA_BITS
Capability: KVM_CAP_S390_CMMA_MIGRATION
diff --git a/Documentation/virt/kvm/devices/vm.txt b/Documentation/virt/kvm/devices/vm.txt
index 4ffb82b02468..38ec142a4a84 100644
--- a/Documentation/virt/kvm/devices/vm.txt
+++ b/Documentation/virt/kvm/devices/vm.txt
@@ -254,6 +254,10 @@ Allows userspace to start migration mode, needed for PGSTE migration.
Setting this attribute when migration mode is already active will have
no effects.
+Dirty tracking must be enabled on all memslots, else -EINVAL is returned. When
+dirty tracking is disabled on any memslot, migration mode is automatically
+stopped.
+
Parameters: none
Returns: -ENOMEM if there is not enough free memory to start migration mode
-EINVAL if the state of the VM is invalid (e.g. no memory defined)
diff --git a/Documentation/vm/memory-model.rst b/Documentation/vm/memory-model.rst
index 58a12376b7df..94db75ba7fbe 100644
--- a/Documentation/vm/memory-model.rst
+++ b/Documentation/vm/memory-model.rst
@@ -52,8 +52,7 @@ wrapper :c:func:`free_area_init`. Yet, the mappings array is not
usable until the call to :c:func:`memblock_free_all` that hands all
the memory to the page allocator.
-If an architecture enables `CONFIG_ARCH_HAS_HOLES_MEMORYMODEL` option,
-it may free parts of the `mem_map` array that do not cover the
+An architecture may free parts of the `mem_map` array that do not cover the
actual physical pages. In such case, the architecture specific
:c:func:`pfn_valid` implementation should take the holes in the
`mem_map` into account.